Update app.py
app.py (CHANGED)
@@ -46,6 +46,8 @@ def load_model_for_zerocpu():
             model_type="llama",
             gpu_layers=0
         )
+        # For ctransformers models, the tokenizer is often separate, or not strictly needed for basic chat templates
+        # We use the original model's tokenizer for consistency and template application.
         tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
@@ -79,16 +81,36 @@ def predict_chat(message: str, history: list):
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return

-
+    # Gradio history is already formatted as a list of lists: [[user_msg, bot_msg], ...]
+    # We need to convert it to the format expected by the tokenizer's chat template.
+    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
+    for human, assistant in history:
+        messages.append({"role": "user", "content": human})
+        messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})

     generated_text = ""
     start_time = time.time()

-    # CORRECTED: Check against ctransformers.llm.LLM directly
+    # CORRECTED: Check against ctransformers.llm.LLM directly and ensure parameters are correct
     if GGUF_AVAILABLE and isinstance(model, LLM):
         print("Using GGUF model generation path.")
-
+        # Apply chat template for GGUF models as well,
+        # though ctransformers might expect a simpler string.
+        # This can be adjusted if the model has a specific prompt format.
+        # For Llama-based models, the tokenizer.apply_chat_template should work.
+        prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        try:
+            # The do_sample parameter should be passed directly, not as part of the prompt string
+            # Also, 'stream=True' is crucial for token-by-token output in Gradio
+            for token in model(
+                prompt_input,
+                max_new_tokens=MAX_NEW_TOKENS,
+                temperature=TEMPERATURE,
+                top_k=TOP_K,
+                top_p=TOP_P,
+                do_sample=DO_SAMPLE,  # Corrected parameter passing
                 repetition_penalty=1.1,
                 stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
                 stream=True
@@ -96,20 +118,21 @@ def predict_chat(message: str, history: list):
                 generated_text += token
                 yield generated_text
         except Exception as e:
-            print(f"Error in GGUF generation: {e}")
-            # Fallback to non-streaming generation
+            print(f"Error in GGUF streaming generation: {e}")
+            # Fallback to non-streaming generation if streaming fails
+            # Ensure the output is processed correctly
             output = model(
                 prompt_input,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=TEMPERATURE,
                 top_k=TOP_K,
                 top_p=TOP_P,
-
+                do_sample=DO_SAMPLE,  # Corrected parameter passing
                 repetition_penalty=1.1,
                 stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
             )
-
-            generated_text
+            # If not streaming, the 'output' is the complete string
+            generated_text = output
             yield generated_text

     else:
@@ -117,18 +140,25 @@ def predict_chat(message: str, history: list):
         input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

+        # Using stream=True for Hugging Face generation with yield for Gradio
+        # Note: `model.generate` for Hugging Face `transformers` typically doesn't stream token by token
+        # in the same way ctransformers does directly. For true streaming with HF models,
+        # you'd often need a custom generation loop or a specific streaming API.
+        # For this example, we'll generate the full response and then yield it.
+        # If true token-by-token streaming is critical for the HF model,
+        # you might need to adjust this part or use a different model.
         outputs = model.generate(
             inputs,
             max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-
+            do_sample=DO_SAMPLE,  # Uncommented for use
             pad_token_id=tokenizer.pad_token_id
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
         yield generated_text
-
+
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
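The GGUF branch above leans on ctransformers' callable model interface: calling the loaded model with stream=True returns a generator that yields text chunks as they are produced, which is what lets predict_chat yield partial replies to Gradio. A minimal, self-contained sketch of that pattern follows; the repository id, file name, and sampling values are placeholders, not this Space's actual model or constants.

# Minimal sketch of the ctransformers streaming pattern used in the GGUF branch.
# The repo id, GGUF file name, and sampling values are placeholders, not this Space's settings.
from ctransformers import AutoModelForCausalLM
from ctransformers.llm import LLM

llm = AutoModelForCausalLM.from_pretrained(
    "someuser/some-model-GGUF",          # placeholder repository id
    model_file="model.Q4_K_M.gguf",      # placeholder GGUF file name
    model_type="llama",
    gpu_layers=0,                        # CPU-only, as in the Space
)
assert isinstance(llm, LLM)              # same type check the diff performs

prompt = "User: Hello!\nAssistant:"
reply = ""
for chunk in llm(
    prompt,
    max_new_tokens=128,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    stop=["User:"],
    stream=True,                         # yields chunks instead of one final string
):
    reply += chunk
    print(chunk, end="", flush=True)

Without stream=True the same call returns the complete string in one piece, which is what the diff's except-branch fallback relies on.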
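The comment in the non-GGUF branch notes that transformers' model.generate does not stream token by token on its own and that true streaming needs a custom generation loop or a dedicated streaming API. One such API is transformers' TextIteratorStreamer, sketched below under the assumption that model, tokenizer, and device are the Space's already-loaded objects; the helper name and the default sampling values are illustrative, not part of this commit.

# Hedged sketch: token-by-token streaming from a transformers model via TextIteratorStreamer.
# `model`, `tokenizer`, and `device` are assumed to be the objects this Space already loads;
# the function name and default values are illustrative, not taken from the commit.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_hf_reply(model, tokenizer, device, input_text,
                    max_new_tokens=256, temperature=0.7, top_k=50, top_p=0.95):
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    # generate() blocks, so run it in a background thread and read decoded text
    # from the streamer as it becomes available.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial                    # Gradio renders each growing partial reply
    thread.join()

Swapping this helper in for the full-response generate call would give the transformers path the same incremental output the ctransformers path already has.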
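Finally, the history handling added at the top of predict_chat assumes Gradio's tuple-style chat history ([[user_msg, bot_msg], ...]) and relies on the function being a generator so that each yield updates the pending bot message. The launch code is not part of this diff; a typical wiring, with illustrative names only, would look roughly like this:

# Illustrative wiring only; the Space's actual Gradio setup is not shown in this commit.
import gradio as gr

demo = gr.ChatInterface(
    fn=predict_chat,       # generator function: each yield replaces the pending bot reply
    title="ZeroCPU Chat Demo",   # placeholder title
)

if __name__ == "__main__":
    demo.queue().launch()  # queue() is needed in some Gradio versions for streaming generator output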