Update app.py
app.py
CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-import
+import torch
 import os
 import time
 
@@ -26,7 +26,7 @@ MAX_NEW_TOKENS = 256
 TEMPERATURE = 0.7
 TOP_K = 50
 TOP_P = 0.95
-DO_SAMPLE = True
+DO_SAMPLE = True  # This parameter is primarily for Hugging Face transformers.Model.generate()
 
 # Global model and tokenizer
 model = None
@@ -102,15 +102,13 @@ def predict_chat(message: str, history: list):
     prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
     try:
-        #
-        # Also, 'stream=True' is crucial for token-by-token output in Gradio
+        # Removed do_sample as it's not accepted by ctransformers.LLM.__call__()
         for token in model(
             prompt_input,
             max_new_tokens=MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-            #do_sample=DO_SAMPLE, # Corrected parameter passing
             repetition_penalty=1.1,
             stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
             stream=True
@@ -127,7 +125,6 @@ def predict_chat(message: str, history: list):
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-            #do_sample=DO_SAMPLE, # Corrected parameter passing
             repetition_penalty=1.1,
             stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
         )
@@ -145,15 +142,13 @@ def predict_chat(message: str, history: list):
         # in the same way ctransformers does directly. For true streaming with HF models,
         # you'd often need a custom generation loop or a specific streaming API.
         # For this example, we'll generate the full response and then yield it.
-        # If true token-by-token streaming is critical for the HF model,
-        # you might need to adjust this part or use a different model.
         outputs = model.generate(
             inputs,
             max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-
+            do_sample=DO_SAMPLE,  # Uncommented for use
            pad_token_id=tokenizer.pad_token_id
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
@@ -199,4 +194,4 @@ if __name__ == "__main__":
 
     demo.chatbot.value = initial_messages_for_value
 
-    demo.launch()
+    demo.launch()
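For context on the ctransformers side of this change: do_sample is a transformers generate() argument, while ctransformers' LLM.__call__() already samples according to temperature/top_k/top_p and streams via stream=True, which is why the flag is dropped from that call. Below is a minimal sketch of the corrected streaming path; the GGUF repository and file names are illustrative assumptions, not taken from this Space.

# Sketch of the corrected ctransformers streaming call (parameter values mirror app.py).
# The repo/file names are placeholders, not the ones this Space actually loads.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-GGUF",           # assumption: any GGUF chat model
    model_file="llama-2-7b-chat.Q4_K_M.gguf",  # assumption
    model_type="llama",
)

def stream_reply(prompt_input: str):
    # ctransformers samples internally; there is no do_sample switch to pass.
    for token in llm(
        prompt_input,
        max_new_tokens=256,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.1,
        stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
        stream=True,
    ):
        yield token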
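The comment in the Hugging Face fallback path notes that transformers does not stream out of model.generate() the way ctransformers does, so this commit generates the full response and yields it once. If token-by-token output is wanted there, one option (an assumption on my part, not something this commit does) is transformers' TextIteratorStreamer driven from a background thread:

# Sketch of token-by-token streaming with transformers via TextIteratorStreamer.
# model, tokenizer, and the generation constants are assumed to be the app's globals.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply_hf(prompt_input: str):
    inputs = tokenizer(prompt_input, return_tensors="pt").input_ids.to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,        # honored here, unlike in the ctransformers path
        temperature=TEMPERATURE,
        top_k=TOP_K,
        top_p=TOP_P,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer,
    )
    # generate() blocks, so it runs in a thread while the streamer yields text pieces.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial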