zamal committed on
Commit bcb49b1 · verified · 1 Parent(s): 2b7b95f

Update app.py

Files changed (1)
  app.py: +31 -27
app.py CHANGED
@@ -2,38 +2,49 @@ import gradio as gr
  from transformers import AutoModelForCausalLM, AutoProcessor
  from PIL import Image
  import torch
+ import os
+
+ # Set environment variable to skip the CUDA build for flash-attn
+ os.environ["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"

  # Define the repository for the quantized model
  repo_name = "cyan2k/molmo-7B-D-bnb-4bit"
- arguments = {"device_map": "auto", "torch_dtype": torch.float16, "trust_remote_code": True}

- # Load the processor and quantized model
- processor = AutoProcessor.from_pretrained(repo_name, **arguments)
- model = AutoModelForCausalLM.from_pretrained(repo_name, **arguments)
+ # Load the processor and model with GPU optimization
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ processor = AutoProcessor.from_pretrained(repo_name, trust_remote_code=True)
+
+ # Load the model with 4-bit quantization
+ model = AutoModelForCausalLM.from_pretrained(repo_name,
+                                              device_map="auto",
+                                              torch_dtype=torch.float16,
+                                              load_in_4bit=True,
+                                              trust_remote_code=True)
+ # Note: device_map="auto" already places the model; .to() is not supported for 4-bit models

  def process_image_and_text(image, text):
-     # Process the image and text
-     inputs = processor(
-         images=[Image.fromarray(image)],
-         text=text,
-         return_tensors="pt"
-     )
-
-     # Move inputs to the same device as the model (GPU)
-     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-     # Generate output
+     # Convert the numpy image from Gradio to PIL format
+     pil_image = Image.fromarray(image)
+
+     # Process the image and text, moving the tensors to the device
+     inputs = processor(images=[pil_image], text=text, return_tensors="pt").to(device)
+
+     # Generate output using the model
      output = model.generate(**inputs, max_new_tokens=200)

-     # Only get generated tokens; decode them to text
-     generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
+     # Decode the generated output
+     generated_text = processor.decode(output[0], skip_special_tokens=True)
      return generated_text

  def chatbot(image, text, history):
+     # Check that an image has been uploaded
      if image is None:
          return history + [("Please upload an image first.", None)]

+     # Get a response by processing the image and text
      response = process_image_and_text(image, text)
+
+     # Append the question and response to the chat history
      history.append((text, response))
      return history

@@ -50,16 +61,9 @@ with gr.Blocks() as demo:
      state = gr.State([])

-     submit_button.click(
-         chatbot,
-         inputs=[image_input, text_input, state],
-         outputs=[chatbot_output]
-     )
-
-     text_input.submit(
-         chatbot,
-         inputs=[image_input, text_input, state],
-         outputs=[chatbot_output]
-     )
+     # Connect the submit button and the textbox to the chatbot function
+     submit_button.click(fn=chatbot, inputs=[image_input, text_input, state], outputs=chatbot_output)
+     text_input.submit(fn=chatbot, inputs=[image_input, text_input, state], outputs=chatbot_output)

- demo.launch()
+ # Launch the Gradio app with a public share link
+ demo.launch(share=True)
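
A note on the loading block in this commit: recent transformers releases deprecate the bare load_in_4bit=True flag in favor of an explicit BitsAndBytesConfig, and bitsandbytes-quantized models cannot be moved with .to() once device_map="auto" has placed them. A minimal sketch of an equivalent load under those assumptions (the repo name comes from the commit; the compute-dtype choice is an assumption):

    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig

    repo_name = "cyan2k/molmo-7B-D-bnb-4bit"

    # Assumed quantization settings; not taken from the commit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    processor = AutoProcessor.from_pretrained(repo_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        repo_name,
        device_map="auto",  # accelerate places the weights; no model.to() call needed
        quantization_config=bnb_config,
        trust_remote_code=True,
    )

Since this checkpoint is already stored in bnb 4-bit form, the explicit quantization_config may be redundant; the loader can pick it up from the checkpoint's own config.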
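
Also, processor.decode(output[0], skip_special_tokens=True) decodes the whole sequence, prompt included, so the chat reply would echo the question. If only the completion is wanted, a common pattern (assuming the processor output carries an input_ids tensor, as the model.generate(**inputs, ...) call implies) is to slice off the prompt tokens first:

    # Number of prompt tokens fed to generate()
    prompt_len = inputs["input_ids"].shape[1]

    # Decode only the newly generated tokens
    generated_text = processor.decode(output[0][prompt_len:], skip_special_tokens=True)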
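
Finally, the event wiring relies on chatbot() mutating the gr.State list in place, and the textbox is never cleared after a submit. A sketch of a more explicit wiring, reusing the component names from the diff (returning the history to both the Chatbot and the State, plus an empty string to reset the input, is an assumption about the intended UX):

    def chatbot(image, text, history):
        if image is None:
            history = history + [(text, "Please upload an image first.")]
        else:
            history = history + [(text, process_image_and_text(image, text))]
        # Update the chat window and the stored history, and clear the textbox
        return history, history, ""

    submit_button.click(fn=chatbot,
                        inputs=[image_input, text_input, state],
                        outputs=[chatbot_output, state, text_input])
    text_input.submit(fn=chatbot,
                      inputs=[image_input, text_input, state],
                      outputs=[chatbot_output, state, text_input])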