padmanabhbosamia committed on
Commit 2eb6820 · verified · 1 Parent(s): 14f2b83

Upload app.py


Fix CUDA error by forcing CPU inference

Files changed (1)
  1. app.py +10 -11
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import random
 import time
@@ -10,9 +10,8 @@ model_path = "./phi2-qlora-final"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
-    device_map="auto",
-    load_in_8bit=True,  # Use 8-bit quantization instead of 4-bit
-    torch_dtype=torch.float16,
+    device_map="cpu",  # Force CPU usage
+    torch_dtype=torch.float32,  # Use float32 for CPU
     trust_remote_code=True
 )
 
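For context: `load_in_8bit=True` goes through bitsandbytes, which requires a CUDA GPU, so loading the model this way on a CPU-only Space raises the CUDA error this commit works around. A sketch of a device-aware loader that keeps 8-bit quantization when a GPU is present and falls back to the commit's CPU path otherwise; this is not part of the commit, and passing a `BitsAndBytesConfig` via `quantization_config` is simply the currently recommended way to request 8-bit loading:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_path = "./phi2-qlora-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)

if torch.cuda.is_available():
    # GPU path: 8-bit quantization via bitsandbytes keeps memory usage low.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
else:
    # CPU path: bitsandbytes 8-bit loading requires CUDA, so fall back
    # to plain float32 weights on the CPU (what this commit does).
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="cpu",
        torch_dtype=torch.float32,
        trust_remote_code=True,
    )
```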
@@ -57,7 +56,7 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=
     if not prompt.strip():
         return "Please enter a prompt."
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    inputs = tokenizer(prompt, return_tensors="pt")
 
     with torch.no_grad():  # Disable gradient computation for inference
         outputs = model.generate(
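Dropping `.to(model.device)` is safe here only because `device_map="cpu"` guarantees the weights and the tokenized inputs both live on the CPU. A device-agnostic sketch of the surrounding function; everything past the visible diff context (the `generate` kwargs, the decoding step, and the truncated `top_k` default) is assumed rather than taken from the commit:

```python
def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=50):
    # top_k default assumed; it is truncated in the diff's hunk header.
    if not prompt.strip():
        return "Please enter a prompt."

    # Resolve the device at runtime instead of hard-coding CPU or GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():  # no gradients needed for inference
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```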
@@ -93,16 +92,16 @@ example_prompts = [
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
-        # 🤖 Phi-2 QLoRA Chat Interface
+        # 🤖 Phi-2 QLoRA Chat Interface (CPU Version)
 
-        Chat with the fine-tuned Phi-2 model using QLoRA. Adjust the parameters below to control the generation.
+        Chat with the fine-tuned Phi-2 model using QLoRA. This version runs on CPU for better compatibility.
         """,
         elem_classes="title"
     )
 
     gr.Markdown(
         """
-        This interface allows you to interact with a fine-tuned Phi-2 model. You can adjust various parameters to control the generation process.
+        This interface allows you to interact with a fine-tuned Phi-2 model. Note that responses may be slower due to CPU-only inference.
         """,
         elem_classes="description"
     )
@@ -123,8 +122,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     with gr.Row():
         max_length = gr.Slider(
             minimum=64,
-            maximum=1024,
-            value=512,
+            maximum=512,  # Reduced max length for CPU
+            value=256,  # Reduced default length
             step=64,
             label="Max Length",
             info="Maximum length of generated response"
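Generation time on CPU grows roughly linearly with the number of tokens produced, so halving the slider's ceiling and default bounds worst-case latency. A minimal sketch of how such a slider typically feeds `generate_response` in a `gr.Blocks` app; the component names and button wiring here are hypothetical, since the real event handlers sit outside this diff:

```python
import gradio as gr

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")    # hypothetical name
    max_length = gr.Slider(                    # mirrors the new limits
        minimum=64, maximum=512, value=256, step=64, label="Max Length"
    )
    output_box = gr.Textbox(label="Response")  # hypothetical name
    gr.Button("Generate").click(
        fn=generate_response,  # defined earlier in app.py
        inputs=[prompt_box, max_length],
        outputs=output_box,
    )
```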
@@ -187,7 +186,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
         ---
-        Made with ❤️ using Phi-2 and QLoRA
+        Made with ❤️ using Phi-2 and QLoRA (CPU Version)
         """,
         elem_classes="footer"
     )
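On a CPU-only Space, long `generate` calls can outlive an HTTP request, and Gradio's queue hands them off to a worker instead. A hedged sketch of a launch block suited to this setup; the actual launch call is outside this diff:

```python
if __name__ == "__main__":
    # Queueing avoids request timeouts on slow CPU generations.
    iface.queue().launch()
```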
 