Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Files Community

prithivMLmods committed on 27 days ago

Commit

8ac376e

verified ·

1 Parent(s): 8f60151

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -11

app.py CHANGED Viewed

@@ -19,7 +19,6 @@ from qwen_vl_utils import process_vision_info
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Camel-Doc-OCR-062825
@@ -117,6 +116,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             {"type": "text", "text": text},
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt_full],
@@ -126,10 +126,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
@@ -175,6 +177,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         image, timestamp = frame
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -184,6 +187,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -197,6 +201,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
@@ -208,10 +213,9 @@ def generate_video(model_name: str, text: str, video_path: str,
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 video_examples = [
     ["explain the ad video in detail.", "videos/1.mp4"],
     ["explain the video in detail.", "videos/2.mp4"]
@@ -231,10 +235,96 @@ css = """
     border-radius: 10px;
     padding: 20px;
 }
 """
 # Create the Gradio Interface
-with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
@@ -255,30 +345,24 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
-                with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
                 choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[

 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Camel-Doc-OCR-062825
             {"type": "text", "text": text},
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt_full],
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         image, timestamp = frame
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
     }
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 video_examples = [
     ["explain the ad video in detail.", "videos/1.mp4"],
     ["explain the video in detail.", "videos/2.mp4"]
     border-radius: 10px;
     padding: 20px;
 }
+/* From Uiverse.io by Subaashbala */
+button {
+  display: flex;
+  justify-content: space-around;
+  align-items: center;
+  padding: 1em 0em 1em 1em;
+  background-color: yellow;
+  cursor: pointer;
+  box-shadow: 4px 6px 0px black;
+  border: 4px solid;
+  border-radius: 15px;
+  position: relative;
+  overflow: hidden;
+  z-index: 100;
+  transition: box-shadow 250ms, transform 250ms, filter 50ms;
+}
+button:hover {
+  transform: translate(2px, 2px);
+  box-shadow: 2px 3px 0px black;
+}
+button:active {
+  filter: saturate(0.75);
+}
+button::after {
+  content: "";
+  position: absolute;
+  inset: 0;
+  background-color: pink;
+  z-index: -1;
+  transform: translateX(-100%);
+  transition: transform 250ms;
+}
+button:hover::after {
+  transform: translateX(0);
+}
+.bgContainer {
+  position: relative;
+  display: flex;
+  justify-content: start;
+  align-items: center;
+  overflow: hidden;
+  max-width: 35%; /* adjust this if the button text is not proper */
+  font-size: 2em;
+  font-weight: 600;
+}
+.bgContainer span {
+  position: relative;
+  transform: translateX(-100%);
+  transition: all 250ms;
+}
+.button:hover .bgContainer > span {
+  transform: translateX(0);
+}
+.arrowContainer {
+  padding: 1em;
+  margin-inline-end: 1em;
+  border: 4px solid;
+  border-radius: 50%;
+  background-color: pink;
+  position: relative;
+  overflow: hidden;
+  transition: transform 250ms, background-color 250ms;
+  z-index: 100;
+}
+.arrowContainer::after {
+  content: "";
+  position: absolute;
+  inset: 0;
+  border-radius: inherit;
+  background-color: yellow;
+  transform: translateX(-100%);
+  z-index: -1;
+  transition: transform 250ms ease-in-out;
+}
+button:hover .arrowContainer::after {
+  transform: translateX(0);
+}
+button:hover .arrowContainer {
+  transform: translateX(5px);
+}
+button:active .arrowContainer {
+  transform: translateX(8px);
+}
+.arrowContainer svg {
+  vertical-align: middle;
+}
 """
 # Create the Gradio Interface
+with gr.Blocks(css=css) as demo:
     gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+                with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
                 choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[