michailroussos committed
Commit 3dc2f1d · 1 Parent(s): 35ddf38
Files changed (1)
  1. app.py +38 -98
app.py CHANGED
@@ -1,124 +1,64 @@
  import gradio as gr
  from unsloth import FastLanguageModel
- import torch

- # Load the model and tokenizer locally
  max_seq_length = 2048
  model_name_or_path = "michailroussos/model_llama_8d"

- # Load model and tokenizer using unsloth
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name=model_name_or_path,
      max_seq_length=max_seq_length,
      load_in_4bit=True,
  )
- FastLanguageModel.for_inference(model)  # Enable optimized inference

- # Define the response function
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     print("\n" + "="*50)
-     print("===== RESPOND FUNCTION CALLED =====")
-     print("="*50)
-
-     # Print input parameters
-     print(f"Input Message: {message}")
-     print(f"System Message: {system_message}")
-     print(f"Max Tokens: {max_tokens}")
-     print(f"Temperature: {temperature}")
-     print(f"Top-p: {top_p}")
-
-     # Debug history
-     print("\n--- Current History ---")
-     print(f"History Type: {type(history)}")
-     print(f"History Content: {history}")
-
-     # Ensure history is formatted as a list of dictionaries
-     messages = [{"role": "system", "content": system_message}]  # Add system message at the start

      try:
-         if history:
-             print("\n--- Processing Existing History ---")
-             for entry in history:
-                 # Ensure each history entry is in the correct format
-                 if isinstance(entry, dict) and 'role' in entry and 'content' in entry:
-                     messages.append(entry)
-                 else:
-                     print(f"Skipping malformed history entry: {entry}")

-         # Add the current user message
-         print("\n--- Adding Current Message ---")
-         messages.append({"role": "user", "content": message})
-
-         # Debug messages before tokenization
-         print("\n--- Messages Before Tokenization ---")
-         for msg in messages:
-             print(f"Role: {msg['role']}, Content: {msg['content'][:100]}...")
-
-         # Tokenize the input
-         print("\n--- Tokenizing Input ---")
          inputs = tokenizer.apply_chat_template(
              messages,
              tokenize=True,
              add_generation_prompt=True,
              return_tensors="pt",
-         ).to("cuda" if torch.cuda.is_available() else "cpu")
-
-         print(f"Tokenized Inputs Shape: {inputs.shape}")
-         print(f"Tokenized Inputs Device: {inputs.device}")
-
          # Generate response
-         attention_mask = inputs.ne(tokenizer.pad_token_id).long()
-
-         try:
-             generated_tokens = model.generate(
-                 input_ids=inputs,
-                 attention_mask=attention_mask,
-                 max_new_tokens=max_tokens,
-                 use_cache=True,
-                 temperature=temperature,
-                 top_p=top_p,
-             )
-
-             # Decode the generated response
-             response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-             print("\n--- Generated Response ---")
-             print(f"Raw Response: {response}")
-
-             # Prepare return history in OpenAI messages format
-             return_messages = [{"role": "user", "content": message},
-                                {"role": "assistant", "content": response}]
-
-             # Add previous conversation turns if any
-             for entry in (history or []):
-                 return_messages.insert(0, {"role": entry['role'], "content": entry['content']})
-
-             print("\n--- Return Messages ---")
-             for msg in return_messages:
-                 print(f"Role: {msg['role']}, Content: {msg['content'][:100]}...")
-
-             return return_messages
-
-         except Exception as gen_error:
-             print("\n--- GENERATION ERROR ---")
-             print(f"Error during model generation: {gen_error}")
-             return []
-
-     except Exception as prep_error:
-         print("\n--- PREPARATION ERROR ---")
-         print(f"Error during message preparation: {prep_error}")
-         return []

- # Define the Gradio interface
  demo = gr.ChatInterface(
-     fn=respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a helpful assistant.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
-     ],
-     type="messages"  # Explicitly set to messages type
  )

  if __name__ == "__main__":
-     demo.launch(share=False)  # Use share=False for local testing
 
  import gradio as gr
+ from transformers import TextStreamer
  from unsloth import FastLanguageModel

+ # Define constants
  max_seq_length = 2048
+ dtype = None
  model_name_or_path = "michailroussos/model_llama_8d"

+ # Load the model and tokenizer
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name=model_name_or_path,
      max_seq_length=max_seq_length,
+     dtype=dtype,
      load_in_4bit=True,
  )

+ # Optimize model for inference
+ FastLanguageModel.for_inference(model)

+ # Function to generate a response
+ def chat_with_model(user_message, chat_history=None):
      try:
+         # Prepare the input messages
+         messages = [{"role": "user", "content": user_message}]

+         # Tokenize and prepare inputs for the model
          inputs = tokenizer.apply_chat_template(
              messages,
              tokenize=True,
              add_generation_prompt=True,
              return_tensors="pt",
+         ).to("cuda")
+
          # Generate response
+         text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+         output = model.generate(
+             input_ids=inputs,
+             streamer=text_streamer,
+             max_new_tokens=128,
+             use_cache=True,
+             temperature=1.5,
+             min_p=0.1,
+         )
+
+         # Append the response to the chat history
+         if chat_history is None:
+             chat_history = []
+         chat_history.append((user_message, output))
+         return "", chat_history
+     except Exception as e:
+         return f"Error: {str(e)}", chat_history

+ # Create the chat interface
  demo = gr.ChatInterface(
+     fn=chat_with_model,
+     chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
+     title="Hugging Face Chat Model",
+     description="Chat with a Hugging Face model using FastLanguageModel.",
  )

+ # Launch the app
  if __name__ == "__main__":
+     demo.launch()
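
Note on the new chat_with_model: model.generate returns a tensor of token IDs (prompt plus completion), and gr.ChatInterface calls its fn with (message, history) and expects the assistant reply back as text. Below is a minimal sketch of a response function along those lines, assuming the same model and tokenizer loaded above; the name respond and the prompt-stripping slice are illustrative, not part of this commit.

def respond(message, history):
    # Build the chat prompt for a single-turn reply (sketch; history handling omitted).
    messages = [{"role": "user", "content": message}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")
    output = model.generate(
        input_ids=inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.5,
        min_p=0.1,
    )
    # generate() returns prompt + completion token IDs; keep only the new tokens and decode them.
    new_tokens = output[0][inputs.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

Wired up as gr.ChatInterface(fn=respond), the decoded text is what the chatbot would display.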