Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,46 +1,47 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoProcessor
+from transformers import AutoModelForCausalLM, AutoProcessor
 from PIL import Image
 import torch
 import subprocess
+import os
 
-# …
-…
 
+# Set environment variable to skip CUDA build for flash-attn
+os.environ["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"
 
+# Install flash-attn package
+subprocess.run('pip install flash-attn --no-build-isolation', shell=True)
 
 # Define the repository for the quantized model
 repo_name = "cyan2k/molmo-7B-D-bnb-4bit"
-arguments = {"device_map": "auto", "torch_dtype": "auto", "trust_remote_code": True}
 
-# Load …
-…
-…
+# Load processor and model with GPU optimization
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = AutoProcessor.from_pretrained(repo_name, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(repo_name, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
 
 def process_image_and_text(image, text):
-    # …
-    …
-    )
-
-    # …
-    …
-
-    # …
-    …
-        **inputs,
-        max_new_tokens=200
-    )
-
-    # Only get generated tokens; decode them to text
-    generated_text = processor.decode(output, skip_special_tokens=True)
+    # Convert numpy image to PIL format
+    pil_image = Image.fromarray(image)
+
+    # Process image and text with processor
+    inputs = processor(images=[pil_image], text=text, return_tensors="pt").to(device)
+
+    # Generate output using the model
+    output = model.generate(**inputs, max_new_tokens=200)
+
+    # Decode the generated output
+    generated_text = processor.decode(output[0], skip_special_tokens=True)
     return generated_text
 
 def chatbot(image, text, history):
+    # Check if the image is uploaded
     if image is None:
         return history + [("Please upload an image first.", None)]
 
+    # Get response by processing the image and text
     response = process_image_and_text(image, text)
+
+    # Append question and response to the chat history
     history.append((text, response))
     return history
 
@@ -57,16 +58,9 @@ with gr.Blocks() as demo:
 
     state = gr.State([])
 
-    # …
-    …
-        outputs=[chatbot_output]
-    )
-
-    text_input.submit(
-        chatbot,
-        inputs=[image_input, text_input, state],
-        outputs=[chatbot_output]
-    )
+    # Connect the submit button and textbox to the chatbot function
+    submit_button.click(fn=chatbot, inputs=[image_input, text_input, state], outputs=chatbot_output)
+    text_input.submit(fn=chatbot, inputs=[image_input, text_input, state], outputs=chatbot_output)
 
-…
+# Launch the Gradio app with GPU
+demo.launch(share=True)
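A note on the new install step: FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is read by flash-attn's build script and tells it to skip compiling the CUDA extension, which is what lets pip install flash-attn --no-build-isolation finish inside a Space's startup window. A minimal guarded variant (the try/except is my sketch, not part of the commit) would avoid re-running pip on every restart:

import os
import subprocess

os.environ["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"

try:
    import flash_attn  # noqa: F401 -- already installed, skip the slow pip step
except ImportError:
    # check=True makes a failed install raise instead of silently continuing
    subprocess.run(["pip", "install", "flash-attn", "--no-build-isolation"], check=True)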
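One behavioral change worth flagging: the removed decode step was commented "Only get generated tokens; decode them to text", while the new code decodes output[0] in full. If model.generate returns the prompt tokens together with the continuation, the reply will echo the question. A sketch of the slicing variant, where decode_new_tokens is a hypothetical helper and inputs is assumed to carry an input_ids tensor of shape [batch, prompt_len]:

def decode_new_tokens(processor, inputs, output):
    # Drop the prompt tokens so only the model's continuation is decoded
    prompt_len = inputs["input_ids"].shape[1]
    return processor.decode(output[0, prompt_len:], skip_special_tokens=True)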
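The components the new event wiring refers to (image_input, text_input, submit_button, chatbot_output) are defined in the unchanged lines the diff skips over (roughly lines 47-57 of app.py). For orientation only, a plausible layout under default settings; the real definitions in app.py may differ:

import gradio as gr

with gr.Blocks() as demo:
    chatbot_output = gr.Chatbot()   # displays the (question, answer) history
    image_input = gr.Image()        # numpy array, converted via Image.fromarray
    text_input = gr.Textbox(placeholder="Ask something about the image")
    submit_button = gr.Button("Submit")
    state = gr.State([])            # per-session chat history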
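Finally, the Space runs on ZeroGPU ("Running on Zero"). The usual ZeroGPU pattern, which this commit does not show, is to decorate the GPU-bound function with @spaces.GPU so a device is attached only while the call runs; a hedged sketch, assuming the spaces package that ships on such hardware:

import spaces

@spaces.GPU  # ZeroGPU attaches a GPU only for the duration of this call
def process_image_and_text(image, text):
    ...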