prithivMLmods committed on
Commit
5ecf0f4
·
verified ·
1 Parent(s): 1774f71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -24
app.py CHANGED
@@ -17,6 +17,8 @@ from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  TextIteratorStreamer,
 
 
20
  )
21
 
22
  js_func = """
@@ -77,6 +79,17 @@ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
77
  MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
78
  ).to(device).eval()
79
 
 
 
 
 
 
 
 
 
 
 
 
80
  # --- Utility Functions ---
81
  def layoutjson2md(layout_data: List[Dict]) -> str:
82
  """Converts the structured JSON from Layout Analysis into formatted Markdown."""
@@ -121,6 +134,28 @@ def process_document_stream(model_name: str, task_choice: str, image: Image.Imag
121
  # 1. Select prompt based on user's task choice
122
  text_prompt = ocr_prompt if task_choice == "Content Extraction" else layout_prompt
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  # 2. Select model and processor
125
  if model_name == "Camel-Doc-OCR-062825": processor, model = processor_m, model_m
126
  elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
@@ -136,10 +171,10 @@ def process_document_stream(model_name: str, task_choice: str, image: Image.Imag
136
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
137
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
138
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
139
-
140
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
141
  thread.start()
142
-
143
  # 4. Stream raw output to the UI in real-time
144
  buffer = ""
145
  for new_text in streamer:
@@ -157,11 +192,11 @@ def process_document_stream(model_name: str, task_choice: str, image: Image.Imag
157
  json_match = re.search(r'```json\s*([\s\S]+?)\s*```', buffer)
158
  if not json_match:
159
  raise json.JSONDecodeError("JSON object not found in output.", buffer, 0)
160
-
161
  json_str = json_match.group(1)
162
  layout_data = json.loads(json_str)
163
  markdown_content = layoutjson2md(layout_data)
164
-
165
  yield buffer, markdown_content, layout_data
166
  except Exception as e:
167
  error_md = f"❌ **Error:** Failed to parse Layout JSON.\n\n**Details:**\n`{str(e)}`"
@@ -173,7 +208,7 @@ def create_gradio_interface():
173
  """Builds and returns the Gradio web interface."""
174
  css = """
175
  .main-container { max-width: 1400px; margin: 0 auto; }
176
- .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
177
  .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
178
  """
179
  with gr.Blocks(theme="bethecloud/storj_theme", css=css, js=js_func) as demo:
@@ -185,15 +220,16 @@ def create_gradio_interface():
185
  </p>
186
  </div>
187
  """)
188
-
189
  with gr.Row():
190
  # Left Column (Inputs)
191
  with gr.Column(scale=1):
192
  model_choice = gr.Dropdown(
193
- choices=["Camel-Doc-OCR-062825",
194
- "MonkeyOCR-Recognition",
195
- "Nanonets-OCR-s",
196
- "Megalodon-OCR-Sync-0713"],
 
197
  label="Select Model", value="Nanonets-OCR-s"
198
  )
199
  task_choice = gr.Dropdown(
@@ -203,7 +239,7 @@ def create_gradio_interface():
203
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
204
  with gr.Accordion("Advanced Settings", open=False):
205
  max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
206
-
207
  process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
208
  clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
209
 
@@ -217,37 +253,37 @@ def create_gradio_interface():
217
  examples=["examples/example_img2.png", "examples/example_img1.png"],
218
  inputs=image_input,
219
  label="Examples"
220
- )
221
- with gr.Tab("📰 README.md"):
222
  with gr.Accordion("(Formatted Result)", open=True):
223
  markdown_output = gr.Markdown(label="Formatted Markdown")
224
-
225
  with gr.Tab("📋 Layout Analysis Results"):
226
  json_output = gr.JSON(label="Structured Layout Data (JSON)")
227
-
228
  # Event Handlers
229
  def clear_all_outputs():
230
  return None, "Raw output will appear here.", "Formatted results will appear here.", None
231
 
232
  process_btn.click(
233
  fn=process_document_stream,
234
- inputs=[model_choice,
235
- task_choice,
236
- image_input,
237
  max_new_tokens],
238
- outputs=[raw_output_stream,
239
- markdown_output,
240
  json_output]
241
  )
242
  clear_btn.click(
243
  clear_all_outputs,
244
- outputs=[image_input,
245
- raw_output_stream,
246
- markdown_output,
247
  json_output]
248
  )
249
  return demo
250
 
251
  if __name__ == "__main__":
252
  demo = create_gradio_interface()
253
- demo.queue().launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  TextIteratorStreamer,
20
+ AutoModel,
21
+ AutoTokenizer
22
  )
23
 
24
  js_func = """
 
79
  MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
80
  ).to(device).eval()
81
 
82
+ # --- New Model ---
83
+ MODEL_ID_V4 = 'openbmb/MiniCPM-V-4'
84
+ model_v4 = AutoModel.from_pretrained(
85
+ MODEL_ID_V4,
86
+ trust_remote_code=True,
87
+ torch_dtype=torch.bfloat16,
88
+ attn_implementation='sdpa' # Use 'flash_attention_2' if available and supported
89
+ ).eval().to(device)
90
+ tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True)
91
+
92
+
93
  # --- Utility Functions ---
94
  def layoutjson2md(layout_data: List[Dict]) -> str:
95
  """Converts the structured JSON from Layout Analysis into formatted Markdown."""
 
134
  # 1. Select prompt based on user's task choice
135
  text_prompt = ocr_prompt if task_choice == "Content Extraction" else layout_prompt
136
 
137
+ # --- New Model Handling ---
138
+ if model_name == "openbmb/MiniCPM-V-4":
139
+ if task_choice == "Layout Analysis(.json)":
140
+ yield "This model is not optimized for Layout Analysis.", "Task not supported for this model.", None
141
+ return
142
+
143
+ question = "What is in this image?"
144
+ msgs = [{'role': 'user', 'content': [image, question]}]
145
+
146
+ # Since this model's .chat method isn't a generator, we run it in a thread
147
+ # and yield the final result. A more advanced implementation could stream it.
148
+ try:
149
+ answer = model_v4.chat(
150
+ image=image.convert('RGB'),
151
+ msgs=msgs,
152
+ tokenizer=tokenizer_v4
153
+ )
154
+ yield answer, answer, None
155
+ except Exception as e:
156
+ yield f"Error: {str(e)}", "An error occurred.", None
157
+ return
158
+
159
  # 2. Select model and processor
160
  if model_name == "Camel-Doc-OCR-062825": processor, model = processor_m, model_m
161
  elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
 
171
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
172
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
173
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
174
+
175
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
176
  thread.start()
177
+
178
  # 4. Stream raw output to the UI in real-time
179
  buffer = ""
180
  for new_text in streamer:
 
192
  json_match = re.search(r'```json\s*([\s\S]+?)\s*```', buffer)
193
  if not json_match:
194
  raise json.JSONDecodeError("JSON object not found in output.", buffer, 0)
195
+
196
  json_str = json_match.group(1)
197
  layout_data = json.loads(json_str)
198
  markdown_content = layoutjson2md(layout_data)
199
+
200
  yield buffer, markdown_content, layout_data
201
  except Exception as e:
202
  error_md = f"❌ **Error:** Failed to parse Layout JSON.\n\n**Details:**\n`{str(e)}`"
 
208
  """Builds and returns the Gradio web interface."""
209
  css = """
210
  .main-container { max-width: 1400px; margin: 0 auto; }
211
+ .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
212
  .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
213
  """
214
  with gr.Blocks(theme="bethecloud/storj_theme", css=css, js=js_func) as demo:
 
220
  </p>
221
  </div>
222
  """)
223
+
224
  with gr.Row():
225
  # Left Column (Inputs)
226
  with gr.Column(scale=1):
227
  model_choice = gr.Dropdown(
228
+ choices=["Camel-Doc-OCR-062825",
229
+ "MonkeyOCR-Recognition",
230
+ "Nanonets-OCR-s",
231
+ "Megalodon-OCR-Sync-0713",
232
+ "openbmb/MiniCPM-V-4"],
233
  label="Select Model", value="Nanonets-OCR-s"
234
  )
235
  task_choice = gr.Dropdown(
 
239
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
240
  with gr.Accordion("Advanced Settings", open=False):
241
  max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
242
+
243
  process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
244
  clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
245
 
 
253
  examples=["examples/example_img2.png", "examples/example_img1.png"],
254
  inputs=image_input,
255
  label="Examples"
256
+ )
257
+ with gr.Tab("📰 README.md"):
258
  with gr.Accordion("(Formatted Result)", open=True):
259
  markdown_output = gr.Markdown(label="Formatted Markdown")
260
+
261
  with gr.Tab("📋 Layout Analysis Results"):
262
  json_output = gr.JSON(label="Structured Layout Data (JSON)")
263
+
264
  # Event Handlers
265
  def clear_all_outputs():
266
  return None, "Raw output will appear here.", "Formatted results will appear here.", None
267
 
268
  process_btn.click(
269
  fn=process_document_stream,
270
+ inputs=[model_choice,
271
+ task_choice,
272
+ image_input,
273
  max_new_tokens],
274
+ outputs=[raw_output_stream,
275
+ markdown_output,
276
  json_output]
277
  )
278
  clear_btn.click(
279
  clear_all_outputs,
280
+ outputs=[image_input,
281
+ raw_output_stream,
282
+ markdown_output,
283
  json_output]
284
  )
285
  return demo
286
 
287
  if __name__ == "__main__":
288
  demo = create_gradio_interface()
289
+ demo.queue().launch(share=True, ssr_mode=False, show_error=True)