OP7 committed on
Commit
7c4d1e2
·
verified ·
1 Parent(s): 80eb147

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -7
app.py CHANGED
@@ -29,20 +29,22 @@ from qwen_vl_utils import process_vision_info
29
  import gradio as gr
30
  from PIL import Image
31
  import torch
32
- print(torch.cuda.memory_summary())
33
 
34
 
35
  # Create a configuration for quantization
36
  quantization_config = BitsAndBytesConfig(
37
- load_in_4bit=True, # Set to True for 4-bit quantization
38
- bnb_4bit_compute_dtype="float16", # Use float16 for faster computations
39
- bnb_4bit_use_double_quant=True, # Optional: Double quantization for memory savings
40
- bnb_4bit_quant_type="nf4", # NormalFloat4 (nf4) is better for performance
41
  )
42
 
43
  # Load the model and processor
44
  model = Qwen2VLForConditionalGeneration.from_pretrained(
45
- "Qwen/QVQ-72B-Preview", device_map="auto", quantization_config=quantization_config,
 
 
46
  )
47
  processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
48
 
@@ -89,7 +91,7 @@ def process_image_and_question(image, question):
89
  output_text = processor.batch_decode(
90
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
91
  )
92
-
93
  return output_text[0] if output_text else "No output generated."
94
 
95
  # Define the Gradio interface
@@ -115,4 +117,5 @@ with gr.Blocks() as demo:
115
  # Launch the interface
116
  demo.launch()
117
 
 
118
 
 
29
  import gradio as gr
30
  from PIL import Image
31
  import torch
32
+
33
 
34
 
35
  # Create a configuration for quantization
36
  quantization_config = BitsAndBytesConfig(
37
+ load_in_4bit=True,
38
+ bnb_4bit_compute_dtype="float16",
39
+ bnb_4bit_use_double_quant=True,
40
+ bnb_4bit_quant_type="nf4",
41
  )
42
 
43
  # Load the model and processor
44
  model = Qwen2VLForConditionalGeneration.from_pretrained(
45
+ "Qwen/QVQ-72B-Preview", device_map="auto",
46
+ quantization_config=quantization_config,
47
+ offload_folder="offload",
48
  )
49
  processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
50
 
 
91
  output_text = processor.batch_decode(
92
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
93
  )
94
+ print(output_text[0] if output_text else "No output generated.")
95
  return output_text[0] if output_text else "No output generated."
96
 
97
  # Define the Gradio interface
 
117
  # Launch the interface
118
  demo.launch()
119
 
120
+ print(torch.cuda.memory_summary())
121