Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -226,16 +226,14 @@ def to_pdf(file_path):
|
|
| 226 |
return tmp_file_path
|
| 227 |
|
| 228 |
|
| 229 |
-
|
| 230 |
-
|
| 231 |
@app.post("/process_document")
|
| 232 |
async def process_document(
|
| 233 |
file: UploadFile = File(...),
|
| 234 |
end_pages: int = 10,
|
| 235 |
is_ocr: bool = False,
|
| 236 |
-
layout_mode: str = "doclayout_yolo",
|
| 237 |
formula_enable: bool = True,
|
| 238 |
-
table_enable: bool =
|
| 239 |
language: str = "auto"
|
| 240 |
):
|
| 241 |
try:
|
|
@@ -244,7 +242,7 @@ async def process_document(
|
|
| 244 |
content = await file.read()
|
| 245 |
buffer.write(content)
|
| 246 |
|
| 247 |
-
# Source 1: magic-pdf
|
| 248 |
md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
|
| 249 |
temp_path,
|
| 250 |
end_pages=end_pages,
|
|
@@ -254,98 +252,89 @@ async def process_document(
|
|
| 254 |
table_enable=table_enable,
|
| 255 |
language=language
|
| 256 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
|
|
|
|
|
|
|
|
|
| 258 |
def extract_text_pymupdf(pdf_path):
|
| 259 |
try:
|
| 260 |
doc = fitz.open(pdf_path)
|
| 261 |
-
text =
|
| 262 |
-
table_text = []
|
| 263 |
-
|
| 264 |
for page_num in range(min(end_pages, doc.page_count)):
|
| 265 |
page = doc[page_num]
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
if block["type"] == 0: # Текстовый блок
|
| 273 |
-
block_text = ""
|
| 274 |
-
is_table_like = False
|
| 275 |
-
|
| 276 |
-
for line in block.get("lines", []):
|
| 277 |
-
line_text = ""
|
| 278 |
-
spans = line.get("spans", [])
|
| 279 |
-
|
| 280 |
-
if len(spans) > 3:
|
| 281 |
-
is_table_like = True
|
| 282 |
-
|
| 283 |
-
for span in spans:
|
| 284 |
-
span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
|
| 285 |
-
if any(sep in span_text for sep in ["|", "\t", "│"]):
|
| 286 |
-
is_table_like = True
|
| 287 |
-
line_text += span_text + " "
|
| 288 |
-
|
| 289 |
-
if (re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or
|
| 290 |
-
re.search(r'\d+[.,]\d{2}', line_text) or
|
| 291 |
-
re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)):
|
| 292 |
-
is_table_like = True
|
| 293 |
-
|
| 294 |
-
if line_text.strip():
|
| 295 |
-
block_text += line_text.strip() + "\n"
|
| 296 |
-
|
| 297 |
-
if is_table_like:
|
| 298 |
-
if not in_table:
|
| 299 |
-
in_table = True
|
| 300 |
-
current_table.append(block_text)
|
| 301 |
-
else:
|
| 302 |
-
if in_table:
|
| 303 |
-
table_text.append("\n".join(current_table))
|
| 304 |
-
current_table = []
|
| 305 |
-
in_table = False
|
| 306 |
-
text.append(block_text)
|
| 307 |
-
|
| 308 |
-
elif block["type"] == 1:
|
| 309 |
-
if in_table:
|
| 310 |
-
table_text.append("\n".join(current_table))
|
| 311 |
-
current_table = []
|
| 312 |
-
in_table = False
|
| 313 |
-
table_text.append("<TABLE_DATA>")
|
| 314 |
-
|
| 315 |
doc.close()
|
| 316 |
-
return
|
| 317 |
-
|
| 318 |
except Exception as e:
|
| 319 |
-
|
| 320 |
-
return str(e), ""
|
| 321 |
|
| 322 |
-
|
| 323 |
|
| 324 |
-
#
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
}
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
})
|
| 336 |
|
| 337 |
except Exception as e:
|
| 338 |
-
logger.error(f"Process document error: {str(e)}", exc_info=True)
|
| 339 |
return JSONResponse(
|
| 340 |
status_code=500,
|
| 341 |
content={"error": str(e)}
|
| 342 |
)
|
| 343 |
-
|
| 344 |
-
try:
|
| 345 |
-
if os.path.exists(temp_path):
|
| 346 |
-
os.remove(temp_path)
|
| 347 |
-
except Exception as e:
|
| 348 |
-
logger.error(f"Cleanup error: {str(e)}")
|
| 349 |
|
| 350 |
# Initialize models
|
| 351 |
model_init = init_model()
|
|
|
|
| 226 |
return tmp_file_path
|
| 227 |
|
| 228 |
|
|
|
|
|
|
|
| 229 |
@app.post("/process_document")
|
| 230 |
async def process_document(
|
| 231 |
file: UploadFile = File(...),
|
| 232 |
end_pages: int = 10,
|
| 233 |
is_ocr: bool = False,
|
| 234 |
+
layout_mode: str = "doclayout_yolo",
|
| 235 |
formula_enable: bool = True,
|
| 236 |
+
table_enable: bool = False,
|
| 237 |
language: str = "auto"
|
| 238 |
):
|
| 239 |
try:
|
|
|
|
| 242 |
content = await file.read()
|
| 243 |
buffer.write(content)
|
| 244 |
|
| 245 |
+
# Source 1: magic-pdf processing
|
| 246 |
md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
|
| 247 |
temp_path,
|
| 248 |
end_pages=end_pages,
|
|
|
|
| 252 |
table_enable=table_enable,
|
| 253 |
language=language
|
| 254 |
)
|
| 255 |
+
source_1 = txt_content
|
| 256 |
+
|
| 257 |
+
# Source 3: PDFMiner
|
| 258 |
+
def extract_text_pdfminer(pdf_path):
    """Return the text of the PDF at ``pdf_path`` as extracted by ``extract_text``.

    Args:
        pdf_path: Path of the PDF file handed to ``extract_text``.

    Returns:
        The extracted text. If anything raises, the exception message is
        returned as a string instead (the caller cannot tell the two apart).
    """
    # Layout-analysis settings forwarded verbatim to LAParams.
    layout_options = {
        "line_margin": 0.5,
        "word_margin": 0.1,
        "char_margin": 2.0,
        "boxes_flow": 0.5,
        "detect_vertical": True,
    }
    try:
        return extract_text(pdf_path, laparams=LAParams(**layout_options))
    except Exception as exc:
        return str(exc)
|
| 271 |
|
| 272 |
+
source_3 = extract_text_pdfminer(temp_path)
|
| 273 |
+
|
| 274 |
+
# Source 4: PyMuPDF (more precise for tables and structured content)
|
| 275 |
def extract_text_pymupdf(pdf_path):
    """Return the text of the first pages of ``pdf_path`` using ``fitz``.

    At most ``end_pages`` pages are read. ``end_pages`` is not a parameter of
    this helper; it is resolved from the surrounding scope (presumably the
    ``end_pages`` argument of the enclosing request handler - confirm).

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        The text of every block on the processed pages, each followed by a
        newline. If anything raises, the exception message is returned as a
        string instead.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            chunks = []
            for page_num in range(min(end_pages, doc.page_count)):
                blocks = doc[page_num].get_text("blocks")
                # Order blocks by their second field, then their first
                # (vertical position, then horizontal).
                blocks.sort(key=lambda b: (b[1], b[0]))
                # Field 4 of each block is appended as that block's text.
                chunks.extend(b[4] + "\n" for b in blocks)
            # Join once instead of growing a string with += per block.
            return "".join(chunks)
        finally:
            # Close the document even when extraction fails part-way, so the
            # handle is not leaked on the error path.
            doc.close()
    except Exception as e:
        return str(e)
|
|
|
|
| 291 |
|
| 292 |
+
source_4 = extract_text_pymupdf(temp_path)
|
| 293 |
|
| 294 |
+
# Clean up
|
| 295 |
+
os.remove(temp_path)
|
| 296 |
+
|
| 297 |
+
# Compare and validate results
|
| 298 |
+
def validate_results(sources):
    """Score each extracted text by counting banking keywords, amounts and dates.

    Args:
        sources: Mapping of source label to extracted text. A ``None`` text is
            scored as an empty string instead of raising.

    Returns:
        A dict with the same keys as ``sources``. Each value is a dict with:
        ``text`` (the input value, unchanged), ``confidence_score``
        (``(keywords + amounts + dates) / 10``; not capped, so it can exceed
        1), ``amounts_found`` and ``dates_found``.
    """
    banking_keywords = ('balance', 'deposit', 'withdrawal', 'transaction', 'account')
    # NOTE(review): the amount pattern is unanchored, so any run of digits
    # (including the parts of a date) is counted as one or more "amounts".
    # Kept as-is so reported counts and scores do not change.
    amount_pattern = re.compile(r'\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?')
    date_pattern = re.compile(r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}')

    validated_results = {}
    for idx, source in sources.items():
        # A missing extraction result must not abort scoring of the others.
        searchable = "" if source is None else source
        lowered = searchable.lower()  # lower-case once, not once per keyword

        keyword_presence = sum(keyword in lowered for keyword in banking_keywords)
        amounts_found = len(amount_pattern.findall(searchable))
        dates_found = len(date_pattern.findall(searchable))

        validated_results[idx] = {
            'text': source,
            'confidence_score': (keyword_presence + amounts_found + dates_found) / 10,
            'amounts_found': amounts_found,
            'dates_found': dates_found,
        }
    return validated_results
|
| 321 |
+
|
| 322 |
+
validated_sources = validate_results({
|
| 323 |
+
'source_1': source_1,
|
| 324 |
+
'source_3': source_3,
|
| 325 |
+
'source_4': source_4
|
| 326 |
+
})
|
| 327 |
+
|
| 328 |
+
return JSONResponse({
|
| 329 |
+
"sources": validated_sources
|
| 330 |
})
|
| 331 |
|
| 332 |
except Exception as e:
|
|
|
|
| 333 |
return JSONResponse(
|
| 334 |
status_code=500,
|
| 335 |
content={"error": str(e)}
|
| 336 |
)
|
| 337 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
# Initialize models
|
| 340 |
model_init = init_model()
|