OP7 committed
Commit 4b1adc0 · verified
1 Parent(s): 648ecd5

Update app.py

Files changed (1)
  1. app.py +124 -76
app.py CHANGED
@@ -1,85 +1,135 @@
- # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- #
- # This space is created by SANJOG GHONGE for testing and learning purpose.
- #
- # If you want to remove this space or credits please contact me on my email id [[email protected]].
- #
- # Citation : @misc{qvq-72b-preview,
- #     title = {QVQ: To See the World with Wisdom},
- #     url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
- #     author = {Qwen Team},
- #     month = {December},
- #     year = {2024}
- # }
-
- # @article{Qwen2VL,
- #   title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
- #   author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai,
- #   Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang,
- #   Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou,
- #   Jingren and Lin, Junyang},
- #   journal={arXiv preprint arXiv:2409.12191},
- #   year={2024}
- # }
- #
- # -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
- from qwen_vl_utils import process_vision_info
  import gradio as gr
- from PIL import Image

- # Load the model and processor
- model = Qwen2VLForConditionalGeneration.from_pretrained(
-     "Qwen/QVQ-72B-Preview", torch_dtype="auto", device_map="auto"
- )
- processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")

- # Function to process the image and question
  def process_image_and_question(image, question):
-     if image is None or question.strip() == "":
          return "Please provide both an image and a question."
-
-     # Prepare the input message
-     messages = [
-         {
-             "role": "system",
-             "content": [
-                 {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
-             ],
-         },
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "image": image},
-                 {"type": "text", "text": question},
-             ],
-         }
-     ]
-
      # Process the inputs
-     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     image_inputs, video_inputs = process_vision_info(messages)
-
-     inputs = processor(
-         text=[text],
-         images=image_inputs,
-         videos=video_inputs,
-         padding=True,
-         return_tensors="pt",
-     )
-     inputs = inputs.to("cuda")
-
      # Generate the output
-     generated_ids = model.generate(**inputs, max_new_tokens=8192)
-     generated_ids_trimmed = [
-         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-     ]
-     output_text = processor.batch_decode(
-         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-     )

-     return output_text[0] if output_text else "No output generated."

  # Define the Gradio interface
  with gr.Blocks() as demo:
@@ -103,5 +153,3 @@ with gr.Blocks() as demo:

  # Launch the interface
  demo.launch()
-
-
+ # # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ # #
+ # # This space is created by SANJOG GHONGE for testing and learning purpose.
+ # #
+ # # If you want to remove this space or credits please contact me on my email id [[email protected]].
+ # #
+ # # Citation : @misc{qvq-72b-preview,
+ # #     title = {QVQ: To See the World with Wisdom},
+ # #     url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
+ # #     author = {Qwen Team},
+ # #     month = {December},
+ # #     year = {2024}
+ # # }
+
+ # # @article{Qwen2VL,
+ # #   title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
+ # #   author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai,
+ # #   Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang,
+ # #   Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou,
+ # #   Jingren and Lin, Junyang},
+ # #   journal={arXiv preprint arXiv:2409.12191},
+ # #   year={2024}
+ # # }
+ # #
+ # # -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ # from qwen_vl_utils import process_vision_info
+ # import gradio as gr
+ # from PIL import Image
+
+ # # Load the model and processor
+ # model = Qwen2VLForConditionalGeneration.from_pretrained(
+ #     "Qwen/QVQ-72B-Preview", torch_dtype="auto", device_map="auto"
+ # )
+ # processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
+
+ # # Function to process the image and question
+ # def process_image_and_question(image, question):
+ #     if image is None or question.strip() == "":
+ #         return "Please provide both an image and a question."
+
+ #     # Prepare the input message
+ #     messages = [
+ #         {
+ #             "role": "system",
+ #             "content": [
+ #                 {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
+ #             ],
+ #         },
+ #         {
+ #             "role": "user",
+ #             "content": [
+ #                 {"type": "image", "image": image},
+ #                 {"type": "text", "text": question},
+ #             ],
+ #         }
+ #     ]
+
+ #     # Process the inputs
+ #     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ #     image_inputs, video_inputs = process_vision_info(messages)
+
+ #     inputs = processor(
+ #         text=[text],
+ #         images=image_inputs,
+ #         videos=video_inputs,
+ #         padding=True,
+ #         return_tensors="pt",
+ #     )
+ #     inputs = inputs.to("cuda")
+
+ #     # Generate the output
+ #     generated_ids = model.generate(**inputs, max_new_tokens=8192)
+ #     generated_ids_trimmed = [
+ #         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ #     ]
+ #     output_text = processor.batch_decode(
+ #         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ #     )
+
+ #     return output_text[0] if output_text else "No output generated."
+
+ # # Define the Gradio interface
+ # with gr.Blocks() as demo:
+ #     gr.Markdown("# Image and Question Answering\nProvide an image (JPG/PNG) and a related question to get an answer.")
+
+ #     with gr.Row():
+ #         with gr.Column():
+ #             image_input = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
+ #             question_input = gr.Textbox(label="Enter your question")
+
+ #         with gr.Column():
+ #             output_box = gr.Textbox(label="Result", interactive=False)
+
+ #     with gr.Row():
+ #         clear_button = gr.Button("Clear")
+ #         submit_button = gr.Button("Submit")
+
+ #     # Define button functionality
+ #     clear_button.click(lambda: (None, "", ""), inputs=[], outputs=[image_input, question_input, output_box])
+ #     submit_button.click(process_image_and_question, inputs=[image_input, question_input], outputs=output_box)
+
+ # # Launch the interface
+ # demo.launch()
+
+
+ # ------------------------------------------------------------------------------------------------------------------------------------
+
+
  import gradio as gr
+ from transformers import AutoProcessor, AutoModelForImageTextToText

+ # Load the processor and model
+ model_name = "Qwen/QVQ-72B-Preview"
+ processor = AutoProcessor.from_pretrained(model_name)
+ model = AutoModelForImageTextToText.from_pretrained(model_name)

+ # Define the prediction function
  def process_image_and_question(image, question):
+     if image is None or not question:
          return "Please provide both an image and a question."
+
      # Process the inputs
+     inputs = processor(images=image, text=question, return_tensors="pt")
+
      # Generate the output
+     outputs = model.generate(**inputs)
+     answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]

+     return answer

  # Define the Gradio interface
  with gr.Blocks() as demo:

  # Launch the interface
  demo.launch()
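
The new inference path loads the 72B checkpoint with default dtype on CPU, leaves `inputs` where the processor created them, and calls `model.generate(**inputs)` with no token budget, whereas the commented-out version relied on `torch_dtype="auto"`, `device_map="auto"`, `inputs.to("cuda")`, and `max_new_tokens=8192`. A minimal sketch of the same `AutoModelForImageTextToText` path with that device and length handling restored; the 8192-token budget is carried over from the old code as an assumption, not something this commit sets:

from transformers import AutoProcessor, AutoModelForImageTextToText

model_name = "Qwen/QVQ-72B-Preview"
processor = AutoProcessor.from_pretrained(model_name)
# Pick the checkpoint's dtype and shard across available GPUs,
# as the commented-out Qwen2VL version did.
model = AutoModelForImageTextToText.from_pretrained(
    model_name, torch_dtype="auto", device_map="auto"
)

def process_image_and_question(image, question):
    if image is None or not question:
        return "Please provide both an image and a question."
    inputs = processor(images=image, text=question, return_tensors="pt")
    # Move the input tensors onto the model's device instead of leaving them on CPU.
    inputs = inputs.to(model.device)
    # max_new_tokens=8192 is an assumption carried over from the old code;
    # without it, generate() stops after roughly 20 new tokens by default.
    outputs = model.generate(**inputs, max_new_tokens=8192)
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]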