padmanabhbosamia committed on
Commit 2eb6820 · verified · 1 Parent(s): 14f2b83

Upload app.py


Fix CUDA error by forcing CPU inference

Files changed (1)
  1. app.py +10 -11
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import random
 import time
@@ -10,9 +10,8 @@ model_path = "./phi2-qlora-final"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
-    device_map="auto",
-    load_in_8bit=True,  # Use 8-bit quantization instead of 4-bit
-    torch_dtype=torch.float16,
+    device_map="cpu",  # Force CPU usage
+    torch_dtype=torch.float32,  # Use float32 for CPU
     trust_remote_code=True
 )
 
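For context: `load_in_8bit=True` goes through bitsandbytes, which requires a CUDA GPU, so loading the model this way on a CPU-only Space raises the CUDA error this commit works around. A sketch of a device-aware loader that keeps 8-bit quantization when a GPU is present and falls back to the commit's CPU path otherwise; this is not part of the commit, and passing a `BitsAndBytesConfig` via `quantization_config` is simply the currently recommended way to request 8-bit loading:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_path = "./phi2-qlora-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)

if torch.cuda.is_available():
    # GPU path: 8-bit quantization via bitsandbytes keeps memory usage low.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
else:
    # CPU path: bitsandbytes 8-bit loading requires CUDA, so fall back
    # to plain float32 weights on the CPU (what this commit does).
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="cpu",
        torch_dtype=torch.float32,
        trust_remote_code=True,
    )
```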
@@ -57,7 +56,7 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=
     if not prompt.strip():
         return "Please enter a prompt."
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    inputs = tokenizer(prompt, return_tensors="pt")
 
     with torch.no_grad():  # Disable gradient computation for inference
         outputs = model.generate(
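Dropping `.to(model.device)` is safe here only because `device_map="cpu"` guarantees the weights and the tokenized inputs both live on the CPU. A device-agnostic sketch of the surrounding function; everything past the visible diff context (the `generate` kwargs, the decoding step, and the truncated `top_k` default) is assumed rather than taken from the commit:

```python
def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=50):
    # top_k default assumed; it is truncated in the diff's hunk header.
    if not prompt.strip():
        return "Please enter a prompt."

    # Resolve the device at runtime instead of hard-coding CPU or GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():  # no gradients needed for inference
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```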
@@ -93,16 +92,16 @@ example_prompts = [
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
-        # 🤖 Phi-2 QLoRA Chat Interface
+        # 🤖 Phi-2 QLoRA Chat Interface (CPU Version)
 
-        Chat with the fine-tuned Phi-2 model using QLoRA. Adjust the parameters below to control the generation.
+        Chat with the fine-tuned Phi-2 model using QLoRA. This version runs on CPU for better compatibility.
         """,
         elem_classes="title"
     )
 
     gr.Markdown(
         """
-        This interface allows you to interact with a fine-tuned Phi-2 model. You can adjust various parameters to control the generation process.
+        This interface allows you to interact with a fine-tuned Phi-2 model. Note that responses may be slower due to CPU-only inference.
         """,
         elem_classes="description"
     )
@@ -123,8 +122,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     with gr.Row():
         max_length = gr.Slider(
             minimum=64,
-            maximum=1024,
-            value=512,
+            maximum=512,  # Reduced max length for CPU
+            value=256,  # Reduced default length
             step=64,
             label="Max Length",
             info="Maximum length of generated response"
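Generation time on CPU grows roughly linearly with the number of tokens produced, so halving the slider's ceiling and default bounds worst-case latency. A minimal sketch of how such a slider typically feeds `generate_response` in a `gr.Blocks` app; the component names and button wiring here are hypothetical, since the real event handlers sit outside this diff:

```python
import gradio as gr

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")    # hypothetical name
    max_length = gr.Slider(                    # mirrors the new limits
        minimum=64, maximum=512, value=256, step=64, label="Max Length"
    )
    output_box = gr.Textbox(label="Response")  # hypothetical name
    gr.Button("Generate").click(
        fn=generate_response,  # defined earlier in app.py
        inputs=[prompt_box, max_length],
        outputs=output_box,
    )
```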
@@ -187,7 +186,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
         ---
-        Made with ❤️ using Phi-2 and QLoRA
+        Made with ❤️ using Phi-2 and QLoRA (CPU Version)
         """,
         elem_classes="footer"
     )
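On a CPU-only Space, long `generate` calls can outlive an HTTP request, and Gradio's queue hands them off to a worker instead. A hedged sketch of a launch block suited to this setup; the actual launch call is outside this diff:

```python
if __name__ == "__main__":
    # Queueing avoids request timeouts on slow CPU generations.
    iface.queue().launch()
```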
 