Update app.py
app.py (CHANGED)
@@ -46,6 +46,8 @@ def load_model_for_zerocpu():
             model_type="llama",
             gpu_layers=0
         )
+        # For ctransformers models, the tokenizer is often separate, or not strictly needed for basic chat templates
+        # We use the original model's tokenizer for consistency and template application.
         tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
@@ -79,16 +81,36 @@ def predict_chat(message: str, history: list):
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return

-
+    # Gradio history is already formatted as a list of lists: [[user_msg, bot_msg], ...]
+    # We need to convert it to the format expected by the tokenizer's chat template.
+    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
+    for human, assistant in history:
+        messages.append({"role": "user", "content": human})
+        messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})

     generated_text = ""
     start_time = time.time()

-    # CORRECTED: Check against ctransformers.llm.LLM directly
+    # CORRECTED: Check against ctransformers.llm.LLM directly and ensure parameters are correct
     if GGUF_AVAILABLE and isinstance(model, LLM):
         print("Using GGUF model generation path.")
-
+        # Apply chat template for GGUF models as well,
+        # though ctransformers might expect a simpler string.
+        # This can be adjusted if the model has a specific prompt format.
+        # For Llama-based models, the tokenizer.apply_chat_template should work.
+        prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        try:
+            # The do_sample parameter should be passed directly, not as part of the prompt string
+            # Also, 'stream=True' is crucial for token-by-token output in Gradio
+            for token in model(
+                prompt_input,
+                max_new_tokens=MAX_NEW_TOKENS,
+                temperature=TEMPERATURE,
+                top_k=TOP_K,
+                top_p=TOP_P,
+                do_sample=DO_SAMPLE,  # Corrected parameter passing
                 repetition_penalty=1.1,
                 stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
                 stream=True
@@ -96,20 +118,21 @@ def predict_chat(message: str, history: list):
                 generated_text += token
                 yield generated_text
         except Exception as e:
-            print(f"Error in GGUF generation: {e}")
-            # Fallback to non-streaming generation
+            print(f"Error in GGUF streaming generation: {e}")
+            # Fallback to non-streaming generation if streaming fails
+            # Ensure the output is processed correctly
             output = model(
                 prompt_input,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=TEMPERATURE,
                 top_k=TOP_K,
                 top_p=TOP_P,
-
+                do_sample=DO_SAMPLE,  # Corrected parameter passing
                 repetition_penalty=1.1,
                 stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
             )
-
-            generated_text
+            # If not streaming, the 'output' is the complete string
+            generated_text = output
             yield generated_text

     else:
@@ -117,18 +140,25 @@ def predict_chat(message: str, history: list):
         input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

+        # Using stream=True for Hugging Face generation with yield for Gradio
+        # Note: `model.generate` for Hugging Face `transformers` typically doesn't stream token by token
+        # in the same way ctransformers does directly. For true streaming with HF models,
+        # you'd often need a custom generation loop or a specific streaming API.
+        # For this example, we'll generate the full response and then yield it.
+        # If true token-by-token streaming is critical for the HF model,
+        # you might need to adjust this part or use a different model.
         outputs = model.generate(
             inputs,
             max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-
+            do_sample=DO_SAMPLE,  # Uncommented for use
             pad_token_id=tokenizer.pad_token_id
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
         yield generated_text
-
+
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
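The GGUF branch above leans on ctransformers' callable model interface: calling the loaded model with stream=True returns a generator that yields text chunks as they are produced, which is what lets predict_chat yield partial replies to Gradio. A minimal, self-contained sketch of that pattern follows; the repository id, file name, and sampling values are placeholders, not this Space's actual model or constants.

# Minimal sketch of the ctransformers streaming pattern used in the GGUF branch.
# The repo id, GGUF file name, and sampling values are placeholders, not this Space's settings.
from ctransformers import AutoModelForCausalLM
from ctransformers.llm import LLM

llm = AutoModelForCausalLM.from_pretrained(
    "someuser/some-model-GGUF",          # placeholder repository id
    model_file="model.Q4_K_M.gguf",      # placeholder GGUF file name
    model_type="llama",
    gpu_layers=0,                        # CPU-only, as in the Space
)
assert isinstance(llm, LLM)              # same type check the diff performs

prompt = "User: Hello!\nAssistant:"
reply = ""
for chunk in llm(
    prompt,
    max_new_tokens=128,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    stop=["User:"],
    stream=True,                         # yields chunks instead of one final string
):
    reply += chunk
    print(chunk, end="", flush=True)

Without stream=True the same call returns the complete string in one piece, which is what the diff's except-branch fallback relies on.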
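The comment in the non-GGUF branch notes that transformers' model.generate does not stream token by token on its own and that true streaming needs a custom generation loop or a dedicated streaming API. One such API is transformers' TextIteratorStreamer, sketched below under the assumption that model, tokenizer, and device are the Space's already-loaded objects; the helper name and the default sampling values are illustrative, not part of this commit.

# Hedged sketch: token-by-token streaming from a transformers model via TextIteratorStreamer.
# `model`, `tokenizer`, and `device` are assumed to be the objects this Space already loads;
# the function name and default values are illustrative, not taken from the commit.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_hf_reply(model, tokenizer, device, input_text,
                    max_new_tokens=256, temperature=0.7, top_k=50, top_p=0.95):
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    # generate() blocks, so run it in a background thread and read decoded text
    # from the streamer as it becomes available.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial                    # Gradio renders each growing partial reply
    thread.join()

Swapping this helper in for the full-response generate call would give the transformers path the same incremental output the ctransformers path already has.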
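Finally, the history handling added at the top of predict_chat assumes Gradio's tuple-style chat history ([[user_msg, bot_msg], ...]) and relies on the function being a generator so that each yield updates the pending bot message. The launch code is not part of this diff; a typical wiring, with illustrative names only, would look roughly like this:

# Illustrative wiring only; the Space's actual Gradio setup is not shown in this commit.
import gradio as gr

demo = gr.ChatInterface(
    fn=predict_chat,       # generator function: each yield replaces the pending bot reply
    title="ZeroCPU Chat Demo",   # placeholder title
)

if __name__ == "__main__":
    demo.queue().launch()  # queue() is needed in some Gradio versions for streaming generator output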