ajsbsd committed
Commit 3de9a17 · verified · Parent: fe3c5c3

Update app.py

Files changed (1)
  1. app.py (+40 −10)
app.py CHANGED
@@ -46,6 +46,8 @@ def load_model_for_zerocpu():
         model_type="llama",
         gpu_layers=0
     )
+    # For ctransformers models, the tokenizer is often separate, or not strictly needed for basic chat templates
+    # We use the original model's tokenizer for consistency and template application.
     tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@@ -79,16 +81,36 @@ def predict_chat(message: str, history: list):
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return
 
-    messages = [{"role": "system", "content": "You are a friendly chatbot."}] + history
+    # Gradio history is already formatted as a list of lists: [[user_msg, bot_msg], ...]
+    # We need to convert it to the format expected by the tokenizer's chat template.
+    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
+    for human, assistant in history:
+        messages.append({"role": "user", "content": human})
+        messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})
 
     generated_text = ""
     start_time = time.time()
 
-    # CORRECTED: Check against ctransformers.llm.LLM directly
+    # CORRECTED: Check against ctransformers.llm.LLM directly and ensure parameters are correct
     if GGUF_AVAILABLE and isinstance(model, LLM):
         print("Using GGUF model generation path.")
-        prompt_input Edo_sampledo_sample=DO_SAMPLE,
+        # Apply chat template for GGUF models as well,
+        # though ctransformers might expect a simpler string.
+        # This can be adjusted if the model has a specific prompt format.
+        # For Llama-based models, the tokenizer.apply_chat_template should work.
+        prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        try:
+            # The do_sample parameter should be passed directly, not as part of the prompt string
+            # Also, 'stream=True' is crucial for token-by-token output in Gradio
+            for token in model(
+                prompt_input,
+                max_new_tokens=MAX_NEW_TOKENS,
+                temperature=TEMPERATURE,
+                top_k=TOP_K,
+                top_p=TOP_P,
+                do_sample=DO_SAMPLE,  # Corrected parameter passing
                 repetition_penalty=1.1,
                 stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
                 stream=True
@@ -96,20 +118,21 @@ def predict_chat(message: str, history: list):
                 generated_text += token
                 yield generated_text
         except Exception as e:
-            print(f"Error in GGUF generation: {e}")
-            # Fallback to non-streaming generation
+            print(f"Error in GGUF streaming generation: {e}")
+            # Fallback to non-streaming generation if streaming fails
+            # Ensure the output is processed correctly
             output = model(
                 prompt_input,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=TEMPERATURE,
                 top_k=TOP_K,
                 top_p=TOP_P,
-                #do_sample=DO_SAMPLE,
+                do_sample=DO_SAMPLE,  # Corrected parameter passing
                 repetition_penalty=1.1,
                 stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
             )
-            yield output
-            generated_text += token
+            # If not streaming, the 'output' is the complete string
+            generated_text = output
             yield generated_text
 
     else:
@@ -117,18 +140,25 @@ def predict_chat(message: str, history: list):
         input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
 
+        # Using stream=True for Hugging Face generation with yield for Gradio
+        # Note: `model.generate` for Hugging Face `transformers` typically doesn't stream token by token
+        # in the same way ctransformers does directly. For true streaming with HF models,
+        # you'd often need a custom generation loop or a specific streaming API.
+        # For this example, we'll generate the full response and then yield it.
+        # If true token-by-token streaming is critical for the HF model,
+        # you might need to adjust this part or use a different model.
         outputs = model.generate(
             inputs,
             max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-            #do_sample=DO_SAMPLE,
+            do_sample=DO_SAMPLE,  # Uncommented for use
             pad_token_id=tokenizer.pad_token_id
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
         yield generated_text
-
+
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
 
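The GGUF branch relies on ctransformers yielding tokens when the model is called with stream=True. The following is a CPU-only sketch of that pattern, assuming the ctransformers package; the GGUF path, prompt, and sampling values are placeholders rather than the Space's actual configuration, and only generation parameters documented by ctransformers are shown.

from ctransformers import AutoModelForCausalLM

# Placeholder path and settings; in the Space these come from the GGUF repo and config constants.
llm = AutoModelForCausalLM.from_pretrained(
    "path/to/model.gguf",
    model_type="llama",
    gpu_layers=0,  # CPU-only, as on ZeroCPU hardware
)

prompt = "User: Tell me a joke.\nAssistant:"
generated = ""
# stream=True turns the call into a generator of token strings,
# which is what lets a Gradio handler re-yield partial text.
for token in llm(
    prompt,
    max_new_tokens=128,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    stop=["User:", "\nUser"],
    stream=True,
):
    generated += token
    print(token, end="", flush=True)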
 
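The comment in the plain transformers branch notes that model.generate does not stream by itself and that true token-by-token output needs a dedicated streaming API. One common approach is transformers' TextIteratorStreamer with generation running in a background thread; here is a sketch under that assumption, using a placeholder model id rather than the Space's ORIGINAL_MODEL_ID.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder chat model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

messages = [
    {"role": "system", "content": "You are a friendly chatbot."},
    {"role": "user", "content": "Tell me a joke."},
]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

# The streamer is an iterator of decoded text chunks; skip_prompt drops the echoed input.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
thread = Thread(
    target=model.generate,
    kwargs=dict(inputs=inputs, max_new_tokens=128, do_sample=True, streamer=streamer),
)
thread.start()

generated = ""
for chunk in streamer:
    generated += chunk
    print(chunk, end="", flush=True)  # in predict_chat this would be `yield generated`
thread.join()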