Commit: Upload app.py (Updated cuda error)
app.py
CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import random
 import time
@@ -10,9 +10,8 @@ model_path = "./phi2-qlora-final"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
-    device_map="
-
-    torch_dtype=torch.float16,
+    device_map="cpu",  # Force CPU usage
+    torch_dtype=torch.float32,  # Use float32 for CPU
     trust_remote_code=True
 )
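For comparison, a device-agnostic variant of this load (a sketch, not part of the commit): it keeps float16 when a GPU is present and falls back to float32 on CPU. Names mirror app.py; the use_cuda flag is illustrative.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./phi2-qlora-final"
use_cuda = torch.cuda.is_available()  # illustrative flag, not in the commit

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto" if use_cuda else "cpu",  # GPU when available, else CPU
    torch_dtype=torch.float16 if use_cuda else torch.float32,  # fp16 needs a GPU
    trust_remote_code=True,
)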
@@ -57,7 +56,7 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=
     if not prompt.strip():
         return "Please enter a prompt."
 
-    inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = tokenizer(prompt, return_tensors="pt")
 
     with torch.no_grad():  # Disable gradient computation for inference
         outputs = model.generate(
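A common cause of CUDA-side errors at this step is input tensors sitting on a different device than the model. A sketch of a device-agnostic version of the same tokenize-and-generate flow, assuming the surrounding tokenizer and model objects; the max_new_tokens value is illustrative:

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # follow the model's device

with torch.no_grad():  # disable gradient tracking for inference
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,  # cap on newly generated tokens (illustrative value)
    )

response = tokenizer.decode(outputs[0], skip_special_tokens=True)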
@@ -93,16 +92,16 @@ example_prompts = [
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
-        # 🤖 Phi-2 QLoRA Chat Interface
+        # 🤖 Phi-2 QLoRA Chat Interface (CPU Version)
 
-        Chat with the fine-tuned Phi-2 model using QLoRA.
+        Chat with the fine-tuned Phi-2 model using QLoRA. This version runs on CPU for better compatibility.
         """,
         elem_classes="title"
     )
 
     gr.Markdown(
         """
-        This interface allows you to interact with a fine-tuned Phi-2 model.
+        This interface allows you to interact with a fine-tuned Phi-2 model. Note that responses may be slower due to CPU-only inference.
         """,
         elem_classes="description"
     )
@@ -123,8 +122,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     with gr.Row():
         max_length = gr.Slider(
             minimum=64,
-            maximum=
-            value=
+            maximum=512,  # Reduced max length for CPU
+            value=256,  # Reduced default length
             step=64,
             label="Max Length",
             info="Maximum length of generated response"
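One caveat when tuning this slider: in transformers, max_length caps prompt plus response tokens combined, so a long prompt shrinks the visible reply, while max_new_tokens caps only the reply. A short sketch of the distinction (values illustrative):

# max_length bounds prompt + reply combined ...
outputs = model.generate(**inputs, max_length=256)

# ... while max_new_tokens bounds only the reply, regardless of prompt size
outputs = model.generate(**inputs, max_new_tokens=256)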
@@ -187,7 +186,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
         ---
-        Made with ❤️ using Phi-2 and QLoRA
+        Made with ❤️ using Phi-2 and QLoRA (CPU Version)
         """,
         elem_classes="footer"
     )
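The diff ends inside the gr.Blocks context, so the launch call is not shown. As an assumption about how app.py ends, a CPU-only Space commonly pairs the changes above with the request queue:

iface.queue()   # serialize requests; helps when CPU inference is slow
iface.launch()  # standard entry point for a Gradio app on Spaces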