prithivMLmods committed on
Commit
8ac376e
·
verified ·
1 Parent(s): 8f60151

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -11
app.py CHANGED
@@ -19,7 +19,6 @@ from qwen_vl_utils import process_vision_info
19
  MAX_MAX_NEW_TOKENS = 2048
20
  DEFAULT_MAX_NEW_TOKENS = 1024
21
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
22
-
23
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
 
25
  # Load Camel-Doc-OCR-062825
@@ -117,6 +116,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
117
  {"type": "text", "text": text},
118
  ]
119
  }]
 
120
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
121
  inputs = processor(
122
  text=[prompt_full],
@@ -126,10 +126,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
126
  truncation=False,
127
  max_length=MAX_INPUT_TOKEN_LENGTH
128
  ).to(device)
 
129
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
130
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
131
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
132
  thread.start()
 
133
  buffer = ""
134
  for new_text in streamer:
135
  buffer += new_text
@@ -175,6 +177,7 @@ def generate_video(model_name: str, text: str, video_path: str,
175
  image, timestamp = frame
176
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
177
  messages[1]["content"].append({"type": "image", "image": image})
 
178
  inputs = processor.apply_chat_template(
179
  messages,
180
  tokenize=True,
@@ -184,6 +187,7 @@ def generate_video(model_name: str, text: str, video_path: str,
184
  truncation=False,
185
  max_length=MAX_INPUT_TOKEN_LENGTH
186
  ).to(device)
 
187
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
188
  generation_kwargs = {
189
  **inputs,
@@ -197,6 +201,7 @@ def generate_video(model_name: str, text: str, video_path: str,
197
  }
198
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
199
  thread.start()
 
200
  buffer = ""
201
  for new_text in streamer:
202
  buffer += new_text
@@ -208,10 +213,9 @@ def generate_video(model_name: str, text: str, video_path: str,
208
  image_examples = [
209
  ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
210
  ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
211
- ["explain the movie shot in detail.", "images/3.png"],
212
  ["fill the correct numbers.", "images/4.png"]
213
  ]
214
-
215
  video_examples = [
216
  ["explain the ad video in detail.", "videos/1.mp4"],
217
  ["explain the video in detail.", "videos/2.mp4"]
@@ -231,10 +235,96 @@ css = """
231
  border-radius: 10px;
232
  padding: 20px;
233
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  """
235
 
236
  # Create the Gradio Interface
237
- with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
238
  gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
239
  with gr.Row():
240
  with gr.Column():
@@ -255,30 +345,24 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
255
  examples=video_examples,
256
  inputs=[video_query, video_upload]
257
  )
258
-
259
  with gr.Accordion("Advanced options", open=False):
260
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
261
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
262
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
263
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
264
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
265
-
266
  with gr.Column():
267
  with gr.Column(elem_classes="canvas-output"):
268
  gr.Markdown("## Output")
269
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
270
-
271
- with gr.Accordion("(Result.md)", open=False):
272
  markdown_output = gr.Markdown(label="(Result.md)")
273
-
274
  model_choice = gr.Radio(
275
  choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
276
  label="Select Model",
277
  value="Camel-Doc-OCR-062825"
278
  )
279
-
280
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
281
-
282
  # Define the submit button actions
283
  image_submit.click(fn=generate_image,
284
  inputs=[
 
19
  MAX_MAX_NEW_TOKENS = 2048
20
  DEFAULT_MAX_NEW_TOKENS = 1024
21
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
22
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
23
 
24
  # Load Camel-Doc-OCR-062825
 
116
  {"type": "text", "text": text},
117
  ]
118
  }]
119
+
120
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
121
  inputs = processor(
122
  text=[prompt_full],
 
126
  truncation=False,
127
  max_length=MAX_INPUT_TOKEN_LENGTH
128
  ).to(device)
129
+
130
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
131
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
132
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
133
  thread.start()
134
+
135
  buffer = ""
136
  for new_text in streamer:
137
  buffer += new_text
 
177
  image, timestamp = frame
178
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
179
  messages[1]["content"].append({"type": "image", "image": image})
180
+
181
  inputs = processor.apply_chat_template(
182
  messages,
183
  tokenize=True,
 
187
  truncation=False,
188
  max_length=MAX_INPUT_TOKEN_LENGTH
189
  ).to(device)
190
+
191
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
192
  generation_kwargs = {
193
  **inputs,
 
201
  }
202
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
203
  thread.start()
204
+
205
  buffer = ""
206
  for new_text in streamer:
207
  buffer += new_text
 
213
  image_examples = [
214
  ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
215
  ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
216
+ ["explain the movie shot in detail.", "images/3.png"],
217
  ["fill the correct numbers.", "images/4.png"]
218
  ]
 
219
  video_examples = [
220
  ["explain the ad video in detail.", "videos/1.mp4"],
221
  ["explain the video in detail.", "videos/2.mp4"]
 
235
  border-radius: 10px;
236
  padding: 20px;
237
  }
238
+ /* From Uiverse.io by Subaashbala */
239
+ button {
240
+ display: flex;
241
+ justify-content: space-around;
242
+ align-items: center;
243
+ padding: 1em 0em 1em 1em;
244
+ background-color: yellow;
245
+ cursor: pointer;
246
+ box-shadow: 4px 6px 0px black;
247
+ border: 4px solid;
248
+ border-radius: 15px;
249
+ position: relative;
250
+ overflow: hidden;
251
+ z-index: 100;
252
+ transition: box-shadow 250ms, transform 250ms, filter 50ms;
253
+ }
254
+ button:hover {
255
+ transform: translate(2px, 2px);
256
+ box-shadow: 2px 3px 0px black;
257
+ }
258
+ button:active {
259
+ filter: saturate(0.75);
260
+ }
261
+ button::after {
262
+ content: "";
263
+ position: absolute;
264
+ inset: 0;
265
+ background-color: pink;
266
+ z-index: -1;
267
+ transform: translateX(-100%);
268
+ transition: transform 250ms;
269
+ }
270
+ button:hover::after {
271
+ transform: translateX(0);
272
+ }
273
+ .bgContainer {
274
+ position: relative;
275
+ display: flex;
276
+ justify-content: start;
277
+ align-items: center;
278
+ overflow: hidden;
279
+ max-width: 35%; /* adjust this if the button text is not proper */
280
+ font-size: 2em;
281
+ font-weight: 600;
282
+ }
283
+ .bgContainer span {
284
+ position: relative;
285
+ transform: translateX(-100%);
286
+ transition: all 250ms;
287
+ }
288
+ .button:hover .bgContainer > span {
289
+ transform: translateX(0);
290
+ }
291
+ .arrowContainer {
292
+ padding: 1em;
293
+ margin-inline-end: 1em;
294
+ border: 4px solid;
295
+ border-radius: 50%;
296
+ background-color: pink;
297
+ position: relative;
298
+ overflow: hidden;
299
+ transition: transform 250ms, background-color 250ms;
300
+ z-index: 100;
301
+ }
302
+ .arrowContainer::after {
303
+ content: "";
304
+ position: absolute;
305
+ inset: 0;
306
+ border-radius: inherit;
307
+ background-color: yellow;
308
+ transform: translateX(-100%);
309
+ z-index: -1;
310
+ transition: transform 250ms ease-in-out;
311
+ }
312
+ button:hover .arrowContainer::after {
313
+ transform: translateX(0);
314
+ }
315
+ button:hover .arrowContainer {
316
+ transform: translateX(5px);
317
+ }
318
+ button:active .arrowContainer {
319
+ transform: translateX(8px);
320
+ }
321
+ .arrowContainer svg {
322
+ vertical-align: middle;
323
+ }
324
  """
325
 
326
  # Create the Gradio Interface
327
+ with gr.Blocks(css=css) as demo:
328
  gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
329
  with gr.Row():
330
  with gr.Column():
 
345
  examples=video_examples,
346
  inputs=[video_query, video_upload]
347
  )
 
348
  with gr.Accordion("Advanced options", open=False):
349
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
350
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
351
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
352
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
353
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
354
  with gr.Column():
355
  with gr.Column(elem_classes="canvas-output"):
356
  gr.Markdown("## Output")
357
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
358
+ with gr.Accordion("(Result.md)", open=False):
 
359
  markdown_output = gr.Markdown(label="(Result.md)")
 
360
  model_choice = gr.Radio(
361
  choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
362
  label="Select Model",
363
  value="Camel-Doc-OCR-062825"
364
  )
 
365
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
 
366
  # Define the submit button actions
367
  image_submit.click(fn=generate_image,
368
  inputs=[