michailroussos committed
Commit 9202d9a · 1 Parent(s): 15bfa4e
Files changed (1)
  1. app.py +15 -19
app.py CHANGED
@@ -20,15 +20,14 @@ FastLanguageModel.for_inference(model) # Enable faster inference
 print("Model loaded successfully!")
 
 # Gradio Response Function
+from transformers import TextStreamer
+
 def respond(message, max_new_tokens, temperature, system_message=""):
     try:
         # Prepare input messages
         messages = [{"role": "system", "content": system_message}] if system_message else []
         messages.append({"role": "user", "content": message})
 
-        # Debug: Show messages
-        print("[DEBUG] Messages:", messages)
-
         # Tokenize inputs
         input_ids = tokenizer.apply_chat_template(
             messages,
@@ -37,37 +36,34 @@ def respond(message, max_new_tokens, temperature, system_message=""):
             return_tensors="pt",
         ).to("cuda")
 
-        # Debug: Inspect input tensor
-        print("[DEBUG] input_ids:", input_ids)
-
         # Ensure the input tensor has the correct dimensions
         if input_ids.dim() != 2:
             raise ValueError(f"`input_ids` must be a 2D tensor. Found shape: {input_ids.shape}")
 
-        # Stream response
-        text_streamer = TextStreamer(tokenizer, skip_prompt=True)
-        model.generate(
-            input_ids=input_ids,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            use_cache=True,
-            streamer=text_streamer,
-        )
+        # Generate output directly
+        with torch.no_grad():  # No need to track gradients for inference
+            output = model.generate(
+                input_ids=input_ids,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                use_cache=True,
+            )
 
-        # Get the response generated by the model
-        # Retrieve text from the output stream (assuming this works with your setup)
-        generated_text = text_streamer.generated_text  # This assumes the `TextStreamer` accumulates the generated text
+        # Decode the generated tokens back to text
+        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
 
-        # Debug: Show the response text
+        # Debug: Show the generated text
         print("[DEBUG] Generated Text:", generated_text)
 
         return generated_text
+
     except Exception as e:
         # Debug: Log errors
         print("[ERROR]", str(e))
         return f"Error: {str(e)}"
 
 
+
 # Gradio UI
 demo = gr.Interface(
     fn=respond,
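
A possible follow-up, not part of this commit: for a causal LM, `model.generate` returns the prompt tokens followed by the newly generated ones, so decoding `output[0]` as-is also echoes the chat-template prompt back in the Gradio response. Below is a minimal sketch, reusing the `model`, `tokenizer`, and `input_ids` names from app.py, of decoding only the tokens produced after the prompt.

import torch

# Sketch only: assumes `model`, `tokenizer`, `input_ids`,
# `max_new_tokens`, and `temperature` exist as in app.py.
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        use_cache=True,
    )

# `output[0]` = prompt tokens + new tokens; keep only the new ones.
new_tokens = output[0][input_ids.shape[-1]:]
generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

Whether the prompt echo matters depends on the chat template being used, so this slicing step is just one common way to trim it.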