# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
#
# This Space was created by SANJOG GHONGE for testing and learning purposes.
#
# If you want this Space or the credits removed, please contact me at my email id [[email protected]].
#
# Citation : @misc{qvq-72b-preview,
#               title = {QVQ: To See the World with Wisdom},
#               url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
#               author = {Qwen Team},
#               month = {December},
#               year = {2024}
#                  }

#           @article{Qwen2VL,
#               title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
#               author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai, 
#               Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang, 
#               Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou, 
#               Jingren and Lin, Junyang},
#               journal={arXiv preprint arXiv:2409.12191},
#               year={2024}
#                   }
#
# -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
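
# The code below assumes the following packages are installed in the environment
# (a typical setup for 4-bit loading with device_map="auto", not the only possible one):
# transformers, qwen-vl-utils, gradio, torch, bitsandbytes, accelerate, pillow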

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info
import gradio as gr
from PIL import Image
import torch

# Create a configuration for quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                  # load model weights in 4-bit precision
    bnb_4bit_compute_dtype="float16",   # run compute (matmuls) in fp16
    bnb_4bit_use_double_quant=True,     # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",          # NormalFloat4 data type
)

# Load the model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/QVQ-72B-Preview",
    device_map="auto",
    quantization_config=quantization_config,
    offload_folder="offload",
)
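# Rough expectation (an assumption, not measured here): a 72B model in 4-bit NF4 needs on the
# order of 36-40 GB of GPU memory for the weights alone; with device_map="auto", anything that
# does not fit is placed in CPU RAM and, if needed, spilled to the "offload" folder on disk.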
processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")

# Function to process the image and question
def process_image_and_question(image, question):
    if image is None or question.strip() == "":
        return "Please provide both an image and a question."

    # Prepare the input message
    messages = [
        {
            "role": "system",
            "content": [
                # {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
                {"type": "text", "text": "You are helpful assistant, you give answer in JSON"}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Process the inputs
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Generate the output
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Strip the prompt tokens so only the newly generated tokens are decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    result = output_text[0] if output_text else "No output generated."
    print(result)
    return result

# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Sanjog Test : Image and Question Answering\nProvide an image (JPG/PNG) and a related question to get an answer.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
            question_input = gr.Textbox(label="Enter your question")

        with gr.Column():
            output_box = gr.Textbox(label="Result", interactive=False)

    with gr.Row():
        clear_button = gr.Button("Clear")
        submit_button = gr.Button("Submit")

    # Define button functionality
    clear_button.click(lambda: (None, "", ""), inputs=[], outputs=[image_input, question_input, output_box])
    submit_button.click(process_image_and_question, inputs=[image_input, question_input], outputs=output_box)

# Launch the interface
demo.launch()

# Note: demo.launch() blocks until the Gradio server stops, so this memory summary
# only prints on shutdown.
print(torch.cuda.memory_summary())