Upload app.py
Updated cuda error

app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import random
 import time
@@ -10,9 +10,8 @@ model_path = "./phi2-qlora-final"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
-    device_map="
-
-    torch_dtype=torch.float16,
+    device_map="cpu",  # Force CPU usage
+    torch_dtype=torch.float32,  # Use float32 for CPU
     trust_remote_code=True
 )
 
@@ -57,7 +56,7 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=
     if not prompt.strip():
         return "Please enter a prompt."
 
-    inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = tokenizer(prompt, return_tensors="pt")
 
     with torch.no_grad():  # Disable gradient computation for inference
         outputs = model.generate(
@@ -93,16 +92,16 @@ example_prompts = [
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
-        # 🤖 Phi-2 QLoRA Chat Interface
+        # 🤖 Phi-2 QLoRA Chat Interface (CPU Version)
 
-        Chat with the fine-tuned Phi-2 model using QLoRA.
+        Chat with the fine-tuned Phi-2 model using QLoRA. This version runs on CPU for better compatibility.
         """,
         elem_classes="title"
     )
 
     gr.Markdown(
         """
-        This interface allows you to interact with a fine-tuned Phi-2 model.
+        This interface allows you to interact with a fine-tuned Phi-2 model. Note that responses may be slower due to CPU-only inference.
         """,
         elem_classes="description"
     )
@@ -123,8 +122,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     with gr.Row():
         max_length = gr.Slider(
             minimum=64,
-            maximum=
-            value=
+            maximum=512,  # Reduced max length for CPU
+            value=256,  # Reduced default length
             step=64,
             label="Max Length",
             info="Maximum length of generated response"
@@ -187,7 +186,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
         ---
-        Made with ❤️ using Phi-2 and QLoRA
+        Made with ❤️ using Phi-2 and QLoRA (CPU Version)
         """,
         elem_classes="footer"
    )
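For reference, a minimal self-contained sketch of the CPU-only setup this commit moves the Space to. It assumes the fine-tuned model is saved at ./phi2-qlora-final; the UI wiring and names here (demo, generate_response, a single slider) are illustrative, not a copy of the Space's full app.py.

```python
# Minimal sketch: CPU-only inference for a fine-tuned Phi-2 model behind a Gradio UI.
# Assumptions: the merged model/tokenizer live at ./phi2-qlora-final; names are illustrative.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./phi2-qlora-final"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cpu",           # keep all weights off CUDA (avoids the GPU-less Space error)
    torch_dtype=torch.float32,  # float16 kernels are poorly supported on CPU
    trust_remote_code=True,
)
model.eval()

def generate_response(prompt, max_length=256, temperature=0.7, top_p=0.9, top_k=50):
    if not prompt.strip():
        return "Please enter a prompt."
    inputs = tokenizer(prompt, return_tensors="pt")  # tensors stay on CPU
    with torch.no_grad():  # no gradients needed for inference
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    max_length = gr.Slider(minimum=64, maximum=512, value=256, step=64,
                           label="Max Length")  # kept small so CPU generation stays responsive
    output_box = gr.Textbox(label="Response")
    gr.Button("Generate").click(generate_response,
                                inputs=[prompt_box, max_length],
                                outputs=output_box)

demo.launch()
```

The two load-time changes do most of the work: device_map="cpu" forces every weight onto the CPU, and torch.float32 replaces half precision, which CPUs handle poorly. The smaller max_length cap (512) and default (256) are there to keep CPU-only generation times tolerable.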