Spaces:

OP7
/

SG_TestSpace

Paused

App Files Files Community

OP7 committed on Jan 27

Commit

4b1adc0

verified ·

1 Parent(s): 648ecd5

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -76

app.py CHANGED Viewed

@@ -1,85 +1,135 @@
-# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-#
-# This space is created by SANJOG GHONGE for testing and learning purpose.
-#
-# If you want to remove this space or credits please contact me on my email id [[email protected]].
-#
-# Citation : @misc{qvq-72b-preview,
-#               title = {QVQ: To See the World with Wisdom},
-#               url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
-#               author = {Qwen Team},
-#               month = {December},
-#               year = {2024}
-#                  }
-#           @article{Qwen2VL,
-#               title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
-#               author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai,
-#               Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang,
-#               Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou,
-#               Jingren and Lin, Junyang},
-#               journal={arXiv preprint arXiv:2409.12191},
-#               year={2024}
-#                   }
-#
-# -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
 import gradio as gr
-from PIL import Image
-# Load the model and processor
-model = Qwen2VLForConditionalGeneration.from_pretrained(
-    "Qwen/QVQ-72B-Preview", torch_dtype="auto", device_map="auto"
-)
-processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
-# Function to process the image and question
 def process_image_and_question(image, question):
-    if image is None or question.strip() == "":
         return "Please provide both an image and a question."
-    # Prepare the input message
-    messages = [
-        {
-            "role": "system",
-            "content": [
-                {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
-            ],
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": question},
-            ],
-        }
-    ]
     # Process the inputs
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to("cuda")
     # Generate the output
-    generated_ids = model.generate(**inputs, max_new_tokens=8192)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    return output_text[0] if output_text else "No output generated."
 # Define the Gradio interface
 with gr.Blocks() as demo:
@@ -103,5 +153,3 @@ with gr.Blocks() as demo:
 # Launch the interface
 demo.launch()

+# # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# #
+# # This space is created by SANJOG GHONGE for testing and learning purpose.
+# #
+# # If you want to remove this space or credits please contact me on my email id [[email protected]].
+# #
+# # Citation : @misc{qvq-72b-preview,
+# #               title = {QVQ: To See the World with Wisdom},
+# #               url = {https://qwenlm.github.io/blog/qvq-72b-preview/},
+# #               author = {Qwen Team},
+# #               month = {December},
+# #               year = {2024}
+# #                  }
+# #           @article{Qwen2VL,
+# #               title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
+# #               author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai,
+# #               Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang,
+# #               Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou,
+# #               Jingren and Lin, Junyang},
+# #               journal={arXiv preprint arXiv:2409.12191},
+# #               year={2024}
+# #                   }
+# #
+# # -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+# from qwen_vl_utils import process_vision_info
+# import gradio as gr
+# from PIL import Image
+# # Load the model and processor
+# model = Qwen2VLForConditionalGeneration.from_pretrained(
+#     "Qwen/QVQ-72B-Preview", torch_dtype="auto", device_map="auto"
+# )
+# processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
+# # Function to process the image and question
+# def process_image_and_question(image, question):
+#     if image is None or question.strip() == "":
+#         return "Please provide both an image and a question."
+#     # Prepare the input message
+#     messages = [
+#         {
+#             "role": "system",
+#             "content": [
+#                 {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
+#             ],
+#         },
+#         {
+#             "role": "user",
+#             "content": [
+#                 {"type": "image", "image": image},
+#                 {"type": "text", "text": question},
+#             ],
+#         }
+#     ]
+#     # Process the inputs
+#     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+#     image_inputs, video_inputs = process_vision_info(messages)
+#     inputs = processor(
+#         text=[text],
+#         images=image_inputs,
+#         videos=video_inputs,
+#         padding=True,
+#         return_tensors="pt",
+#     )
+#     inputs = inputs.to("cuda")
+#     # Generate the output
+#     generated_ids = model.generate(**inputs, max_new_tokens=8192)
+#     generated_ids_trimmed = [
+#         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+#     ]
+#     output_text = processor.batch_decode(
+#         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+#     )
+#     return output_text[0] if output_text else "No output generated."
+# # Define the Gradio interface
+# with gr.Blocks() as demo:
+#     gr.Markdown("# Image and Question Answering\nProvide an image (JPG/PNG) and a related question to get an answer.")
+#     with gr.Row():
+#         with gr.Column():
+#             image_input = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
+#             question_input = gr.Textbox(label="Enter your question")
+#         with gr.Column():
+#             output_box = gr.Textbox(label="Result", interactive=False)
+#     with gr.Row():
+#         clear_button = gr.Button("Clear")
+#         submit_button = gr.Button("Submit")
+#     # Define button functionality
+#     clear_button.click(lambda: (None, "", ""), inputs=[], outputs=[image_input, question_input, output_box])
+#     submit_button.click(process_image_and_question, inputs=[image_input, question_input], outputs=output_box)
+# # Launch the interface
+# demo.launch()
+# ------------------------------------------------------------------------------------------------------------------------------------
 import gradio as gr
+from transformers import AutoProcessor, AutoModelForImageTextToText
+# Load the processor and model
+model_name = "Qwen/QVQ-72B-Preview"
+processor = AutoProcessor.from_pretrained(model_name)
+model = AutoModelForImageTextToText.from_pretrained(model_name)
+# Define the prediction function
 def process_image_and_question(image, question):
+    if image is None or not question:
         return "Please provide both an image and a question."
     # Process the inputs
+    inputs = processor(images=image, text=question, return_tensors="pt")
     # Generate the output
+    outputs = model.generate(**inputs)
+    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    return answer
 # Define the Gradio interface
 with gr.Blocks() as demo:
 # Launch the interface
 demo.launch()