prithivMLmods committed on
Commit
b6e3398
·
verified ·
1 Parent(s): 8ac376e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -95
app.py CHANGED
@@ -19,6 +19,7 @@ from qwen_vl_utils import process_vision_info
19
  MAX_MAX_NEW_TOKENS = 2048
20
  DEFAULT_MAX_NEW_TOKENS = 1024
21
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
22
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
23
 
24
  # Load Camel-Doc-OCR-062825
@@ -116,7 +117,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
116
  {"type": "text", "text": text},
117
  ]
118
  }]
119
-
120
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
121
  inputs = processor(
122
  text=[prompt_full],
@@ -126,12 +126,10 @@ def generate_image(model_name: str, text: str, image: Image.Image,
126
  truncation=False,
127
  max_length=MAX_INPUT_TOKEN_LENGTH
128
  ).to(device)
129
-
130
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
131
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
132
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
133
  thread.start()
134
-
135
  buffer = ""
136
  for new_text in streamer:
137
  buffer += new_text
@@ -177,7 +175,6 @@ def generate_video(model_name: str, text: str, video_path: str,
177
  image, timestamp = frame
178
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
179
  messages[1]["content"].append({"type": "image", "image": image})
180
-
181
  inputs = processor.apply_chat_template(
182
  messages,
183
  tokenize=True,
@@ -187,7 +184,6 @@ def generate_video(model_name: str, text: str, video_path: str,
187
  truncation=False,
188
  max_length=MAX_INPUT_TOKEN_LENGTH
189
  ).to(device)
190
-
191
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
192
  generation_kwargs = {
193
  **inputs,
@@ -201,7 +197,6 @@ def generate_video(model_name: str, text: str, video_path: str,
201
  }
202
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
203
  thread.start()
204
-
205
  buffer = ""
206
  for new_text in streamer:
207
  buffer += new_text
@@ -213,9 +208,10 @@ def generate_video(model_name: str, text: str, video_path: str,
213
  image_examples = [
214
  ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
215
  ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
216
- ["explain the movie shot in detail.", "images/3.png"],
217
  ["fill the correct numbers.", "images/4.png"]
218
  ]
 
219
  video_examples = [
220
  ["explain the ad video in detail.", "videos/1.mp4"],
221
  ["explain the video in detail.", "videos/2.mp4"]
@@ -235,96 +231,10 @@ css = """
235
  border-radius: 10px;
236
  padding: 20px;
237
  }
238
- /* From Uiverse.io by Subaashbala */
239
- button {
240
- display: flex;
241
- justify-content: space-around;
242
- align-items: center;
243
- padding: 1em 0em 1em 1em;
244
- background-color: yellow;
245
- cursor: pointer;
246
- box-shadow: 4px 6px 0px black;
247
- border: 4px solid;
248
- border-radius: 15px;
249
- position: relative;
250
- overflow: hidden;
251
- z-index: 100;
252
- transition: box-shadow 250ms, transform 250ms, filter 50ms;
253
- }
254
- button:hover {
255
- transform: translate(2px, 2px);
256
- box-shadow: 2px 3px 0px black;
257
- }
258
- button:active {
259
- filter: saturate(0.75);
260
- }
261
- button::after {
262
- content: "";
263
- position: absolute;
264
- inset: 0;
265
- background-color: pink;
266
- z-index: -1;
267
- transform: translateX(-100%);
268
- transition: transform 250ms;
269
- }
270
- button:hover::after {
271
- transform: translateX(0);
272
- }
273
- .bgContainer {
274
- position: relative;
275
- display: flex;
276
- justify-content: start;
277
- align-items: center;
278
- overflow: hidden;
279
- max-width: 35%; /* adjust this if the button text is not proper */
280
- font-size: 2em;
281
- font-weight: 600;
282
- }
283
- .bgContainer span {
284
- position: relative;
285
- transform: translateX(-100%);
286
- transition: all 250ms;
287
- }
288
- .button:hover .bgContainer > span {
289
- transform: translateX(0);
290
- }
291
- .arrowContainer {
292
- padding: 1em;
293
- margin-inline-end: 1em;
294
- border: 4px solid;
295
- border-radius: 50%;
296
- background-color: pink;
297
- position: relative;
298
- overflow: hidden;
299
- transition: transform 250ms, background-color 250ms;
300
- z-index: 100;
301
- }
302
- .arrowContainer::after {
303
- content: "";
304
- position: absolute;
305
- inset: 0;
306
- border-radius: inherit;
307
- background-color: yellow;
308
- transform: translateX(-100%);
309
- z-index: -1;
310
- transition: transform 250ms ease-in-out;
311
- }
312
- button:hover .arrowContainer::after {
313
- transform: translateX(0);
314
- }
315
- button:hover .arrowContainer {
316
- transform: translateX(5px);
317
- }
318
- button:active .arrowContainer {
319
- transform: translateX(8px);
320
- }
321
- .arrowContainer svg {
322
- vertical-align: middle;
323
- }
324
  """
325
 
326
  # Create the Gradio Interface
327
- with gr.Blocks(css=css) as demo:
328
  gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
329
  with gr.Row():
330
  with gr.Column():
@@ -345,24 +255,30 @@ with gr.Blocks(css=css) as demo:
345
  examples=video_examples,
346
  inputs=[video_query, video_upload]
347
  )
 
348
  with gr.Accordion("Advanced options", open=False):
349
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
350
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
351
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
352
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
353
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
354
  with gr.Column():
355
  with gr.Column(elem_classes="canvas-output"):
356
  gr.Markdown("## Output")
357
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
358
- with gr.Accordion("(Result.md)", open=False):
 
359
  markdown_output = gr.Markdown(label="(Result.md)")
 
360
  model_choice = gr.Radio(
361
  choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
362
  label="Select Model",
363
  value="Camel-Doc-OCR-062825"
364
  )
 
365
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
 
366
  # Define the submit button actions
367
  image_submit.click(fn=generate_image,
368
  inputs=[
 
19
  MAX_MAX_NEW_TOKENS = 2048
20
  DEFAULT_MAX_NEW_TOKENS = 1024
21
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
22
+
23
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
 
25
  # Load Camel-Doc-OCR-062825
 
117
  {"type": "text", "text": text},
118
  ]
119
  }]
 
120
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
121
  inputs = processor(
122
  text=[prompt_full],
 
126
  truncation=False,
127
  max_length=MAX_INPUT_TOKEN_LENGTH
128
  ).to(device)
 
129
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
130
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
131
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
132
  thread.start()
 
133
  buffer = ""
134
  for new_text in streamer:
135
  buffer += new_text
 
175
  image, timestamp = frame
176
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
177
  messages[1]["content"].append({"type": "image", "image": image})
 
178
  inputs = processor.apply_chat_template(
179
  messages,
180
  tokenize=True,
 
184
  truncation=False,
185
  max_length=MAX_INPUT_TOKEN_LENGTH
186
  ).to(device)
 
187
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
188
  generation_kwargs = {
189
  **inputs,
 
197
  }
198
  thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
199
  thread.start()
 
200
  buffer = ""
201
  for new_text in streamer:
202
  buffer += new_text
 
208
  image_examples = [
209
  ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
210
  ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
211
+ ["explain the movie shot in detail.", "images/3.png"],
212
  ["fill the correct numbers.", "images/4.png"]
213
  ]
214
+
215
  video_examples = [
216
  ["explain the ad video in detail.", "videos/1.mp4"],
217
  ["explain the video in detail.", "videos/2.mp4"]
 
231
  border-radius: 10px;
232
  padding: 20px;
233
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  """
235
 
236
  # Create the Gradio Interface
237
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
238
  gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
239
  with gr.Row():
240
  with gr.Column():
 
255
  examples=video_examples,
256
  inputs=[video_query, video_upload]
257
  )
258
+
259
  with gr.Accordion("Advanced options", open=False):
260
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
261
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
262
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
263
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
264
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
265
+
266
  with gr.Column():
267
  with gr.Column(elem_classes="canvas-output"):
268
  gr.Markdown("## Output")
269
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
270
+
271
+ with gr.Accordion("(Result.md)", open=False):
272
  markdown_output = gr.Markdown(label="(Result.md)")
273
+
274
  model_choice = gr.Radio(
275
  choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
276
  label="Select Model",
277
  value="Camel-Doc-OCR-062825"
278
  )
279
+
280
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
281
+
282
  # Define the submit button actions
283
  image_submit.click(fn=generate_image,
284
  inputs=[