dmitrynovikov7211 committed on
Commit
f3f0948
·
verified ·
1 Parent(s): c91f495

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -79
app.py CHANGED
@@ -226,16 +226,14 @@ def to_pdf(file_path):
226
  return tmp_file_path
227
 
228
 
229
-
230
-
231
  @app.post("/process_document")
232
  async def process_document(
233
  file: UploadFile = File(...),
234
  end_pages: int = 10,
235
  is_ocr: bool = False,
236
- layout_mode: str = "doclayout_yolo",
237
  formula_enable: bool = True,
238
- table_enable: bool = True,
239
  language: str = "auto"
240
  ):
241
  try:
@@ -244,7 +242,7 @@ async def process_document(
244
  content = await file.read()
245
  buffer.write(content)
246
 
247
- # Source 1: magic-pdf для основного текста
248
  md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
249
  temp_path,
250
  end_pages=end_pages,
@@ -254,98 +252,89 @@ async def process_document(
254
  table_enable=table_enable,
255
  language=language
256
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
 
 
 
def extract_text_pymupdf(pdf_path):
    """Split the leading pages of a PDF into running text and table-like text.

    Reads at most ``end_pages`` pages (a variable of the enclosing request
    handler) with PyMuPDF's ``"dict"`` text output. Every text block is
    classified as table-like or not; consecutive table-like blocks are
    grouped into one table entry.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(text, tables)`` tuple of newline-joined strings. If anything
        raises, the error is logged and ``(str(error), "")`` is returned
        instead, so callers cannot tell an error message from extracted text
        by type alone.
    """
    separators = ("|", "\t", "│")
    # Line-level hints that a block is tabular data.
    table_hints = (
        re.compile(r'\d{2}[./-]\d{2}[./-]\d{4}'),    # date such as 31.12.2024
        re.compile(r'\d+[.,]\d{2}'),                 # number with two decimals
        re.compile(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}'),    # NOTE(review): looks like an IBAN-style code - confirm
    )

    text = []
    table_text = []
    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            for page_num in range(min(end_pages, doc.page_count)):
                page_dict = doc[page_num].get_text("dict", sort=True)
                # Table-like blocks collected since the last flush; a
                # non-empty list means "currently inside a table".
                current_table = []

                for block in page_dict.get("blocks", []):
                    if block["type"] == 0:  # text block
                        block_text = ""
                        is_table_like = False

                        for line in block.get("lines", []):
                            spans = line.get("spans", [])
                            # Many spans on one line is treated as a row of cells.
                            if len(spans) > 3:
                                is_table_like = True

                            line_text = ""
                            for span in spans:
                                # Round-trip through UTF-8 to drop characters
                                # that cannot be encoded.
                                span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
                                if any(sep in span_text for sep in separators):
                                    is_table_like = True
                                line_text += span_text + " "

                            if any(hint.search(line_text) for hint in table_hints):
                                is_table_like = True

                            if line_text.strip():
                                block_text += line_text.strip() + "\n"

                        if is_table_like:
                            current_table.append(block_text)
                        else:
                            if current_table:
                                table_text.append("\n".join(current_table))
                                current_table = []
                            text.append(block_text)

                    elif block["type"] == 1:  # image block in PyMuPDF's dict output
                        if current_table:
                            table_text.append("\n".join(current_table))
                            current_table = []
                        table_text.append("<TABLE_DATA>")

                # Flush a table that runs to the end of the page; without
                # this its rows would be discarded when the next page starts.
                if current_table:
                    table_text.append("\n".join(current_table))

        return "\n".join(text), "\n".join(table_text)

    except Exception as e:
        logger.error(f"PyMuPDF extraction error: {str(e)}")
        return str(e), ""
321
 
322
- source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
323
 
324
- # Возвращаем в формате, который ожидает DeepSeek
325
- # Возвращаем в том формате, который ожидает process_sources_generator
326
- return JSONResponse({
327
- "sources": {
328
- "magic_pdf": {
329
- "text": txt_content
330
- },
331
- "pymupdf": {
332
- "text": f"{source_4_text}\n{source_4_tables}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  }
334
- }
 
 
 
 
 
 
 
 
 
335
  })
336
 
337
  except Exception as e:
338
- logger.error(f"Process document error: {str(e)}", exc_info=True)
339
  return JSONResponse(
340
  status_code=500,
341
  content={"error": str(e)}
342
  )
343
- finally:
344
- try:
345
- if os.path.exists(temp_path):
346
- os.remove(temp_path)
347
- except Exception as e:
348
- logger.error(f"Cleanup error: {str(e)}")
349
 
350
  # Initialize models
351
  model_init = init_model()
 
226
  return tmp_file_path
227
 
228
 
 
 
229
  @app.post("/process_document")
230
  async def process_document(
231
  file: UploadFile = File(...),
232
  end_pages: int = 10,
233
  is_ocr: bool = False,
234
+ layout_mode: str = "doclayout_yolo",
235
  formula_enable: bool = True,
236
+ table_enable: bool = False,
237
  language: str = "auto"
238
  ):
239
  try:
 
242
  content = await file.read()
243
  buffer.write(content)
244
 
245
+ # Source 1: magic-pdf processing
246
  md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
247
  temp_path,
248
  end_pages=end_pages,
 
252
  table_enable=table_enable,
253
  language=language
254
  )
255
+ source_1 = txt_content
256
+
257
+ # Source 3: PDFMiner
def extract_text_pdfminer(pdf_path):
    """Extract text from the leading pages of a PDF with PDFMiner.

    Only the first ``end_pages`` pages (a variable of the enclosing request
    handler) are parsed, matching the page limit the PyMuPDF extractor in
    the same handler applies.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text. If extraction raises, the exception message is
        returned in place of the text (best-effort behaviour; callers receive
        a string either way).
    """
    # pdfminer interprets maxpages=0 as "no limit", whereas the PyMuPDF
    # extractor yields no text for a non-positive page count; keep both
    # sources consistent.
    if end_pages <= 0:
        return ""
    try:
        laparams = LAParams(
            line_margin=0.5,
            word_margin=0.1,
            char_margin=2.0,
            boxes_flow=0.5,
            detect_vertical=True,
        )
        # NOTE(review): assumes `extract_text` is pdfminer.high_level.extract_text,
        # which accepts `maxpages` - confirm against the file's imports.
        return extract_text(pdf_path, maxpages=end_pages, laparams=laparams)
    except Exception as e:
        return str(e)
271
 
272
+ source_3 = extract_text_pdfminer(temp_path)
273
+
274
+ # Source 4: PyMuPDF (more precise for tables and structured content)
def extract_text_pymupdf(pdf_path):
    """Extract text blocks from the leading pages of a PDF with PyMuPDF.

    Reads at most ``end_pages`` pages (a variable of the enclosing request
    handler). On each page the blocks are ordered top-to-bottom, then
    left-to-right, and their text is concatenated with a newline after
    every block.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated block text. If anything raises, the exception
        message is returned in place of the text (best-effort behaviour;
        callers receive a string either way).
    """
    try:
        pieces = []
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page_num in range(min(end_pages, doc.page_count)):
                # Each block is (x0, y0, x1, y1, text, block_no, block_type).
                blocks = doc[page_num].get_text("blocks")
                # Sort by vertical position, then horizontal.
                blocks.sort(key=lambda b: (b[1], b[0]))
                # block_type 0 is text; image blocks carry only an
                # "<image: ...>" metadata line, which is not document text.
                pieces.extend(b[4] + "\n" for b in blocks if b[6] == 0)
        return "".join(pieces)
    except Exception as e:
        return str(e)
 
291
 
292
+ source_4 = extract_text_pymupdf(temp_path)
293
 
294
+ # Clean up
295
+ os.remove(temp_path)
296
+
297
+ # Compare and validate results
def validate_results(sources):
    """Score each extracted text with simple bank-statement heuristics.

    Args:
        sources: Mapping of source name to extracted text. A ``None`` value
            is treated as empty text.

    Returns:
        A dict with the same keys. Each value is a dict holding:
        ``text`` (the source text, ``""`` for ``None``),
        ``amounts_found`` and ``dates_found`` (regex match counts), and
        ``confidence_score``, which is
        ``(keywords present + amounts_found + dates_found) / 10`` and is
        not capped at 1.
    """
    banking_keywords = ('balance', 'deposit', 'withdrawal', 'transaction', 'account')
    # NOTE(review): this pattern matches any run of digits, so fragments of
    # dates and of long numbers are counted as amounts too.
    amount_pattern = re.compile(r'\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?')
    date_pattern = re.compile(r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}')

    validated_results = {}
    for idx, source in sources.items():
        # An extractor may hand back nothing; score it as empty text
        # instead of failing on None.lower().
        text = "" if source is None else source
        lowered = text.lower()

        # Number of distinct keywords that occur at least once.
        keyword_presence = sum(1 for keyword in banking_keywords if keyword in lowered)
        amounts_found = len(amount_pattern.findall(text))
        dates_found = len(date_pattern.findall(text))

        validated_results[idx] = {
            'text': text,
            'confidence_score': (keyword_presence + amounts_found + dates_found) / 10,
            'amounts_found': amounts_found,
            'dates_found': dates_found,
        }
    return validated_results
321
+
322
+ validated_sources = validate_results({
323
+ 'source_1': source_1,
324
+ 'source_3': source_3,
325
+ 'source_4': source_4
326
+ })
327
+
328
+ return JSONResponse({
329
+ "sources": validated_sources
330
  })
331
 
332
  except Exception as e:
 
333
  return JSONResponse(
334
  status_code=500,
335
  content={"error": str(e)}
336
  )
337
+
 
 
 
 
 
338
 
339
  # Initialize models
340
  model_init = init_model()