Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -226,16 +226,14 @@ def to_pdf(file_path):
|
|
226 |
return tmp_file_path
|
227 |
|
228 |
|
229 |
-
|
230 |
-
|
231 |
@app.post("/process_document")
|
232 |
async def process_document(
|
233 |
file: UploadFile = File(...),
|
234 |
end_pages: int = 10,
|
235 |
is_ocr: bool = False,
|
236 |
-
layout_mode: str = "doclayout_yolo",
|
237 |
formula_enable: bool = True,
|
238 |
-
table_enable: bool =
|
239 |
language: str = "auto"
|
240 |
):
|
241 |
try:
|
@@ -244,7 +242,7 @@ async def process_document(
|
|
244 |
content = await file.read()
|
245 |
buffer.write(content)
|
246 |
|
247 |
-
# Source 1: magic-pdf
|
248 |
md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
|
249 |
temp_path,
|
250 |
end_pages=end_pages,
|
@@ -254,98 +252,89 @@ async def process_document(
|
|
254 |
table_enable=table_enable,
|
255 |
language=language
|
256 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
|
|
|
|
|
|
|
258 |
def extract_text_pymupdf(pdf_path):
|
259 |
try:
|
260 |
doc = fitz.open(pdf_path)
|
261 |
-
text =
|
262 |
-
table_text = []
|
263 |
-
|
264 |
for page_num in range(min(end_pages, doc.page_count)):
|
265 |
page = doc[page_num]
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
if block["type"] == 0: # Текстовый блок
|
273 |
-
block_text = ""
|
274 |
-
is_table_like = False
|
275 |
-
|
276 |
-
for line in block.get("lines", []):
|
277 |
-
line_text = ""
|
278 |
-
spans = line.get("spans", [])
|
279 |
-
|
280 |
-
if len(spans) > 3:
|
281 |
-
is_table_like = True
|
282 |
-
|
283 |
-
for span in spans:
|
284 |
-
span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
|
285 |
-
if any(sep in span_text for sep in ["|", "\t", "│"]):
|
286 |
-
is_table_like = True
|
287 |
-
line_text += span_text + " "
|
288 |
-
|
289 |
-
if (re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or
|
290 |
-
re.search(r'\d+[.,]\d{2}', line_text) or
|
291 |
-
re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)):
|
292 |
-
is_table_like = True
|
293 |
-
|
294 |
-
if line_text.strip():
|
295 |
-
block_text += line_text.strip() + "\n"
|
296 |
-
|
297 |
-
if is_table_like:
|
298 |
-
if not in_table:
|
299 |
-
in_table = True
|
300 |
-
current_table.append(block_text)
|
301 |
-
else:
|
302 |
-
if in_table:
|
303 |
-
table_text.append("\n".join(current_table))
|
304 |
-
current_table = []
|
305 |
-
in_table = False
|
306 |
-
text.append(block_text)
|
307 |
-
|
308 |
-
elif block["type"] == 1:
|
309 |
-
if in_table:
|
310 |
-
table_text.append("\n".join(current_table))
|
311 |
-
current_table = []
|
312 |
-
in_table = False
|
313 |
-
table_text.append("<TABLE_DATA>")
|
314 |
-
|
315 |
doc.close()
|
316 |
-
return
|
317 |
-
|
318 |
except Exception as e:
|
319 |
-
|
320 |
-
return str(e), ""
|
321 |
|
322 |
-
|
323 |
|
324 |
-
#
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
333 |
}
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
})
|
336 |
|
337 |
except Exception as e:
|
338 |
-
logger.error(f"Process document error: {str(e)}", exc_info=True)
|
339 |
return JSONResponse(
|
340 |
status_code=500,
|
341 |
content={"error": str(e)}
|
342 |
)
|
343 |
-
|
344 |
-
try:
|
345 |
-
if os.path.exists(temp_path):
|
346 |
-
os.remove(temp_path)
|
347 |
-
except Exception as e:
|
348 |
-
logger.error(f"Cleanup error: {str(e)}")
|
349 |
|
350 |
# Initialize models
|
351 |
model_init = init_model()
|
|
|
226 |
return tmp_file_path
|
227 |
|
228 |
|
|
|
|
|
229 |
@app.post("/process_document")
|
230 |
async def process_document(
|
231 |
file: UploadFile = File(...),
|
232 |
end_pages: int = 10,
|
233 |
is_ocr: bool = False,
|
234 |
+
layout_mode: str = "doclayout_yolo",
|
235 |
formula_enable: bool = True,
|
236 |
+
table_enable: bool = False,
|
237 |
language: str = "auto"
|
238 |
):
|
239 |
try:
|
|
|
242 |
content = await file.read()
|
243 |
buffer.write(content)
|
244 |
|
245 |
+
# Source 1: magic-pdf processing
|
246 |
md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
|
247 |
temp_path,
|
248 |
end_pages=end_pages,
|
|
|
252 |
table_enable=table_enable,
|
253 |
language=language
|
254 |
)
|
255 |
+
source_1 = txt_content
|
256 |
+
|
257 |
+
# Source 3: PDFMiner
|
258 |
+
def extract_text_pdfminer(pdf_path):
    """Extract the text of the PDF at *pdf_path* using fixed layout settings.

    The layout-analysis options are passed to ``LAParams`` and the result
    of ``extract_text`` is returned unchanged (presumably the pdfminer.six
    high-level API -- confirm against the file's imports).

    Returns:
        The extracted text. If anything raises, the exception message is
        returned as the string instead, so callers cannot distinguish a
        failure from real text by type alone.
    """
    layout_options = {
        "line_margin": 0.5,
        "word_margin": 0.1,
        "char_margin": 2.0,
        "boxes_flow": 0.5,
        "detect_vertical": True,
    }
    try:
        # LAParams is built inside the try so a bad option is reported the
        # same way as an extraction failure.
        return extract_text(pdf_path, laparams=LAParams(**layout_options))
    except Exception as exc:
        return str(exc)
|
271 |
|
272 |
+
source_3 = extract_text_pdfminer(temp_path)
|
273 |
+
|
274 |
+
# Source 4: PyMuPDF (more precise for tables and structured content)
|
275 |
def extract_text_pymupdf(pdf_path):
    """Extract text from the first pages of a PDF in reading order.

    Opens *pdf_path* with ``fitz`` and reads at most ``end_pages`` pages
    (``end_pages`` is taken from the enclosing scope, not passed in). On
    each page the "blocks" output is sorted top-to-bottom, then
    left-to-right, and the text of every block is emitted followed by a
    newline.

    Returns:
        The concatenated block text. If anything raises, the exception
        message is returned as the string instead (unchanged from the
        previous behaviour, so callers still always receive a ``str``).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            pieces = []
            for page_num in range(min(end_pages, doc.page_count)):
                blocks = doc[page_num].get_text("blocks")
                # Index 1 is the vertical and index 0 the horizontal
                # position of a block; index 4 holds its text.
                blocks.sort(key=lambda b: (b[1], b[0]))
                pieces.extend(b[4] + "\n" for b in blocks)
        finally:
            # Close the document even when a page fails to parse; the
            # earlier version leaked the handle on any mid-loop error.
            doc.close()
        # Join once instead of growing a string with += per block.
        return "".join(pieces)
    except Exception as e:
        return str(e)
|
|
|
291 |
|
292 |
+
source_4 = extract_text_pymupdf(temp_path)
|
293 |
|
294 |
+
# Clean up
|
295 |
+
os.remove(temp_path)
|
296 |
+
|
297 |
+
# Compare and validate results
|
298 |
+
def validate_results(sources):
    """Score each extracted text by simple banking-statement heuristics.

    Args:
        sources: Mapping of a source label to the text extracted by that
            source. A ``None`` value is treated as empty text.

    Returns:
        A dict with the same keys as *sources*. Each value is a dict with:
        ``text`` (the scored text), ``amounts_found`` (number of
        amount-like numbers), ``dates_found`` (number of date-like tokens)
        and ``confidence_score`` (keyword hits + amounts + dates, divided
        by 10). The score is a relative ranking aid and is NOT capped at
        1.0.
    """
    import re  # local import keeps the block self-contained

    banking_keywords = ('balance', 'deposit', 'withdrawal', 'transaction', 'account')
    # A whole number (either comma-grouped thousands or a plain digit run)
    # with optional "$" prefix and two-decimal fraction. The digit
    # look-arounds stop a long run such as "12345" from being split into
    # several separate matches, which used to inflate the amount count.
    amount_pattern = re.compile(
        r'\$?(?<!\d)(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?(?!\d)'
    )
    date_pattern = re.compile(r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}')

    validated_results = {}
    for idx, source in sources.items():
        # An extractor may hand back None; score it as empty text rather
        # than crashing on .lower().
        text = "" if source is None else str(source)
        lowered = text.lower()

        keyword_presence = sum(1 for keyword in banking_keywords if keyword in lowered)
        amounts_found = len(amount_pattern.findall(text))
        dates_found = len(date_pattern.findall(text))

        validated_results[idx] = {
            'text': text,
            'confidence_score': (keyword_presence + amounts_found + dates_found) / 10,
            'amounts_found': amounts_found,
            'dates_found': dates_found,
        }
    return validated_results
|
321 |
+
|
322 |
+
validated_sources = validate_results({
|
323 |
+
'source_1': source_1,
|
324 |
+
'source_3': source_3,
|
325 |
+
'source_4': source_4
|
326 |
+
})
|
327 |
+
|
328 |
+
return JSONResponse({
|
329 |
+
"sources": validated_sources
|
330 |
})
|
331 |
|
332 |
except Exception as e:
|
|
|
333 |
return JSONResponse(
|
334 |
status_code=500,
|
335 |
content={"error": str(e)}
|
336 |
)
|
337 |
+
|
|
|
|
|
|
|
|
|
|
|
338 |
|
339 |
# Initialize models
|
340 |
model_init = init_model()
|