prithivMLmods committed (verified)
Commit 9017b27 · 1 Parent(s): 70ecff9

Update app.py

Files changed (1)
  1. app.py +41 -25
app.py CHANGED
@@ -48,8 +48,8 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#--------------------------------------------------------------------------------------#
-#Load MonkeyOCR
+#-----------------------------subfolder-----------------------------#
+# Load MonkeyOCR
 MODEL_ID_G = "echo840/MonkeyOCR"
 SUBFOLDER = "Recognition"
 
@@ -65,7 +65,7 @@ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     subfolder=SUBFOLDER,
     torch_dtype=torch.float16
 ).to(device).eval()
-#--------------------------------------------------------------------------------------#
+#-----------------------------subfolder-----------------------------#
 
 # Load GLM-4.1V-9B-Thinking
 MODEL_ID_O = "THUDM/GLM-4.1V-9B-Thinking"
@@ -106,6 +106,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the selected model for image input.
+    Yields raw text and Markdown-formatted text.
     """
     if model_name == "docscopeOCR-7B-050425-exp":
         processor = processor_m
@@ -120,11 +121,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_o
         model = model_o
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if image is None:
-        yield "Please upload an image."
+        yield "Please upload an image.", "Please upload an image."
         return
 
     messages = [{
@@ -152,7 +153,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer
+        yield buffer, buffer
 
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
@@ -163,6 +164,7 @@ def generate_video(model_name: str, text: str, video_path: str,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the selected model for video input.
+    Yields raw text and Markdown-formatted text.
     """
     if model_name == "docscopeOCR-7B-050425-exp":
         processor = processor_m
@@ -177,11 +179,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_o
         model = model_o
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if video_path is None:
-        yield "Please upload a video."
+        yield "Please upload a video.", "Please upload a video."
         return
 
     frames = downsample_video(video_path)
@@ -220,18 +222,19 @@ def generate_video(model_name: str, text: str, video_path: str,
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer
+        yield buffer, buffer
 
 # Define examples for image and video inference
 image_examples = [
-    ["fill the correct numbers", "example/image3.png"],
-    ["ocr the image", "example/image1.png"],
-    ["explain the scene", "example/image2.jpg"],
+    ["Extract it as a table for README.md", "images/image0.jpg"],
+    ["Fill the correct numbers", "images/image3.png"],
+    ["OCR the image", "images/image1.png"],
+    ["Explain the scene", "images/image2.jpg"],
 ]
 
 video_examples = [
-    ["Explain the ad in detail", "example/1.mp4"],
-    ["Identify the main actions in the coca cola ad...", "example/2.mp4"]
+    ["Explain the video in detail", "videos/1.mp4"],
+    ["Explain the video in detail", "videos/2.mp4"]
 ]
 
 css = """
@@ -242,6 +245,11 @@ css = """
 .submit-btn:hover {
     background-color: #3498db !important;
 }
+.canvas-output {
+    border: 2px solid #4682B4;
+    border-radius: 10px;
+    padding: 20px;
+}
 """
 
 # Create the Gradio Interface
@@ -271,29 +279,37 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
             top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
             top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-            repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+            repetition_cost = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
         with gr.Column():
-            output = gr.Textbox(label="Output", interactive=False, lines=3, scale=2)
-            model_choice = gr.Radio(
-                choices=["docscopeOCR-7B-050425-exp", "GLM-4.1V-9B-Thinking", "MonkeyOCR-Recognition", "coreOCR-7B-050325-preview"],
+            with gr.Column(elem_classes="canvas-output"):
+                gr.Markdown("## Result.Md")
+                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+
+                with gr.Accordion("Formatted Result (Result.md)", open=False):
+                    markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
+
+            model_choice = gr.Radio(
+                choices=["GLM-4.1V-9B-Thinking", "docscopeOCR-7B-050425-exp", "MonkeyOCR-Recognition", "coreOCR-7B-050325-preview"],
                 label="Select Model",
-                value="docscopeOCR-7B-050425-exp"
+                value="GLM-4.1V-9B-Thinking"
            )
-
            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/core-OCR/discussions)")
+            gr.Markdown("> [GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking): GLM-4.1V-9B-Thinking is designed to explore the upper limits of reasoning in vision-language models. By introducing a \"thinking paradigm\" and leveraging reinforcement learning, the model significantly enhances its capabilities. It achieves state-of-the-art performance among 10B-parameter VLMs.")
            gr.Markdown("> [docscopeOCR-7B-050425-exp](https://huggingface.co/prithivMLmods/docscopeOCR-7B-050425-exp): The docscopeOCR-7B-050425-exp model is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Document-Level Optical Character Recognition (OCR), long-context vision-language understanding, and accurate image-to-text conversion with mathematical LaTeX formatting.")
            gr.Markdown("> [MonkeyOCR](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
            gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): The coreOCR-7B-050325-preview model is a fine-tuned version of Qwen2-VL-7B, optimized for Document-Level Optical Character Recognition (OCR), long-context vision-language understanding, and accurate image-to-text conversion with mathematical LaTeX formatting.")
-
+            gr.Markdown("> ⚠️ Note: the models in this Space are not guaranteed to perform well in video inference use cases.")
+
    image_submit.click(
        fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_cost],
+        outputs=[output, markdown_output]
    )
    video_submit.click(
        fn=generate_video,
-        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output
+        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_cost],
+        outputs=[output, markdown_output]
    )
 
 if __name__ == "__main__":
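
Editor's note: the first two hunks only retitle the divider comments around the MonkeyOCR loader, but the loader itself relies on a transformers detail worth calling out: from_pretrained accepts a subfolder argument, which is how the Recognition weights inside the echo840/MonkeyOCR repo are selected. A minimal sketch of that loading pattern, assuming a transformers build with Qwen2.5-VL support (the processor line is inferred; this diff only shows the model load):

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID_G = "echo840/MonkeyOCR"
SUBFOLDER = "Recognition"  # weights live in a subfolder, not at the repo root
device = "cuda" if torch.cuda.is_available() else "cpu"

# subfolder= must be passed to every from_pretrained call that reads this repo.
processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, subfolder=SUBFOLDER)
model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_G,
    subfolder=SUBFOLDER,
    torch_dtype=torch.float16
).to(device).eval()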
 
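The yield buffer hunks in generate_image and generate_video sit at the bottom of the usual transformers streaming loop: model.generate runs on a worker thread while a TextIteratorStreamer hands decoded chunks to the generator. A hedged sketch of that consumer pattern; stream_generate and its arguments are hypothetical, since the commit touches only the final yield:

import time
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, model_inputs, max_new_tokens=1024):
    # The streamer receives tokens from generate() running on a background
    # thread and exposes them as an iterator of decoded text fragments.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs=dict(**model_inputs, streamer=streamer,
                       max_new_tokens=max_new_tokens)).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")  # drop any leaked end marker
        time.sleep(0.01)                           # pace UI refreshes
        yield buffer, buffer                       # one value per output component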
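
Functionally, the commit makes both generators yield a pair of strings and rewires the click handlers to outputs=[output, markdown_output]; Gradio matches the n-th value in each yield to the n-th output component, so the same buffer streams into the raw Textbox and the rendered Markdown view at once. A self-contained sketch of that wiring, with a toy stream_twice echo generator standing in for the models:

import time
import gradio as gr

def stream_twice(text):
    # A generator used as an event handler streams: every yield updates the UI.
    buffer = ""
    for ch in text:
        buffer += ch
        time.sleep(0.01)
        yield buffer, buffer  # first value -> Textbox, second -> Markdown

canvas_css = ".canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }"

with gr.Blocks(css=canvas_css) as demo:
    query = gr.Textbox(label="Query")
    with gr.Column(elem_classes="canvas-output"):
        output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
        with gr.Accordion("Formatted Result (Result.md)", open=False):
            markdown_output = gr.Markdown()
    submit = gr.Button("Submit")
    submit.click(fn=stream_twice, inputs=[query], outputs=[output, markdown_output])

if __name__ == "__main__":
    demo.launch()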