KB-VQA / app.py
m7mdal7aj's picture
Update app.py
4fd1ca2 verified
raw
history blame
2.11 kB
import streamlit as st
import torch
import bitsandbytes
import accelerate
import scipy
from PIL import Image
import torch.nn as nn
from transformers import Blip2Processor, Blip2ForConditionalGeneration, InstructBlipProcessor, InstructBlipForConditionalGeneration
def load_caption_model(blip2=False, instructblip=True):
    """Load one 8-bit vision-language model together with its processor.

    Args:
        blip2: Load ``Salesforce/blip2-opt-2.7b``.
        instructblip: Load ``Salesforce/instructblip-vicuna-7b``. Takes
            precedence over ``blip2`` when both flags are true, so only
            one checkpoint is ever loaded.

    Returns:
        A ``(model, processor)`` tuple. The model is wrapped in
        ``nn.DataParallel`` when more than one CUDA device is visible.

    Raises:
        ValueError: If neither ``blip2`` nor ``instructblip`` is true.
    """
    if instructblip:
        checkpoint = "Salesforce/instructblip-vicuna-7b"
        model_cls = InstructBlipForConditionalGeneration
        processor_cls = InstructBlipProcessor
    elif blip2:
        checkpoint = "Salesforce/blip2-opt-2.7b"
        model_cls = Blip2ForConditionalGeneration
        processor_cls = Blip2Processor
    else:
        raise ValueError(
            "No model selected: set blip2=True or instructblip=True."
        )

    # The processor holds no model weights, so it takes no quantization or
    # dtype arguments.
    processor = processor_cls.from_pretrained(checkpoint)

    # NOTE(review): no explicit ``.to('cuda')`` here. Per the transformers
    # bitsandbytes documentation, an 8-bit load already places the weights on
    # the current CUDA device, and several transformers releases raise
    # ValueError when ``.to()`` is called on a quantized model.
    model = model_cls.from_pretrained(
        checkpoint, load_in_8bit=True, torch_dtype=torch.float16
    )

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    return model, processor
def answer_question(image, question, model, processor):
    """Generate a text answer to *question* about *image*.

    Args:
        image: Anything accepted by ``PIL.Image.open`` (path or file-like
            object).
        question: The question text passed to the processor with the image.
        model: The generation model, optionally wrapped in
            ``torch.nn.DataParallel``.
        processor: The processor used to build the model inputs and to
            decode the generated token ids.

    Returns:
        The decoded first generated sequence, with special tokens skipped
        and surrounding whitespace stripped.
    """
    picture = Image.open(image)
    batch = processor(picture, question, return_tensors="pt").to("cuda", torch.float16)

    # generate() is called on the underlying model, which a DataParallel
    # wrapper exposes through its `module` attribute.
    generator = model.module if isinstance(model, torch.nn.DataParallel) else model
    token_ids = generator.generate(**batch, max_length=100, min_length=20)

    return processor.decode(token_ids[0], skip_special_tokens=True).strip()
# Page heading.
st.title("Image Question Answering")

# Upload widget for the image, restricted to PNG and JPEG files.
image = st.file_uploader(
    label="Upload an image",
    type=["png", "jpg", "jpeg"],
)

# Free-text field for the question about the uploaded image.
question = st.text_input(label="Enter your question about the image:")