Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -274,79 +274,94 @@ async def process_document(
|
|
274 |
source_3 = extract_text_pdfminer(temp_path)
|
275 |
|
276 |
# Source 4: PyMuPDF, specialized for tables
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
|
278 |
-
# Source 4: PyMuPDF специально для таблиц
|
279 |
-
def extract_text_pymupdf(pdf_path):
    """Extract plain text and table-like text from a PDF using PyMuPDF.

    Walks the blocks of the first ``end_pages`` pages (``end_pages``, ``fitz``,
    ``re`` and ``logger`` come from the enclosing scope) and splits text
    blocks into two streams: ordinary text and blocks that look like table
    rows. A block is treated as table-like when any of its lines has more
    than three spans, a span contains a column separator, or the line
    contains a date or a decimal amount.

    Args:
        pdf_path: Path of the PDF handed to ``fitz.open``.

    Returns:
        A ``(text, table_text)`` tuple of newline-joined strings. If anything
        raises, the error is logged and ``(str(error), "")`` is returned.
    """
    table_separators = ("|", "\t", "│")
    # Patterns typical for bank statement tables, compiled once per call.
    bank_patterns = (
        re.compile(r'\d{2}[./-]\d{2}[./-]\d{4}'),  # dates
        re.compile(r'\d+[.,]\d{2}'),               # amounts
    )

    doc = None
    try:
        doc = fitz.open(pdf_path)
        text = []
        table_text = []

        for page_num in range(min(end_pages, doc.page_count)):
            page_dict = doc[page_num].get_text("dict", sort=True)

            # Consecutive table-like blocks of the current page; a non-empty
            # list means we are inside a table run.
            current_table = []

            for block in page_dict.get("blocks", []):
                if block["type"] == 0:  # text block
                    is_table_like = False
                    block_lines = []

                    for line in block.get("lines", []):
                        spans = line.get("spans", [])

                        # Many spans on one line suggests many columns.
                        if len(spans) > 3:
                            is_table_like = True

                        # Round-trip through UTF-8 to drop unencodable
                        # characters (e.g. lone surrogates).
                        span_texts = [
                            span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
                            for span in spans
                        ]

                        if any(sep in chunk for chunk in span_texts for sep in table_separators):
                            is_table_like = True

                        line_text = " ".join(span_texts).strip()

                        if any(pattern.search(line_text) for pattern in bank_patterns):
                            is_table_like = True

                        if line_text:
                            block_lines.append(line_text)

                    block_text = "".join(row + "\n" for row in block_lines)

                    if is_table_like:
                        current_table.append(block_text)
                    else:
                        if current_table:
                            # The table run ended with this ordinary block.
                            table_text.append("\n".join(current_table))
                            current_table = []
                        text.append(block_text)

                elif block["type"] == 1:  # image block (original comment: table/image)
                    if current_table:
                        table_text.append("\n".join(current_table))
                        current_table = []
                    table_text.append("<TABLE_DATA>")

            # Flush a table run that reaches the end of the page; without
            # this the last table on every page would be silently dropped.
            if current_table:
                table_text.append("\n".join(current_table))

        return "\n".join(text), "\n".join(table_text)

    except Exception as e:
        logger.error(f"PyMuPDF extraction error: {str(e)}")
        return str(e), ""

    finally:
        # Release the document handle on both the success and error paths.
        if doc is not None:
            doc.close()
|
346 |
-
|
347 |
-
source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
|
348 |
-
|
349 |
-
# Валидация для определения лучшего источника таблиц
|
350 |
def validate_table_text(text):
|
351 |
score = 0
|
352 |
|
@@ -363,8 +378,9 @@ async def process_document(
|
|
363 |
score += len(codes) * 3
|
364 |
|
365 |
# Проверка на кириллицу
|
366 |
-
|
367 |
-
|
|
|
368 |
|
369 |
# Проверка структуры таблицы
|
370 |
if len(re.findall(r'\||\t|│', text)) > 5: # Много разделителей
|
@@ -372,17 +388,30 @@ async def process_document(
|
|
372 |
|
373 |
return score
|
374 |
|
375 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
table_scores = {
|
377 |
'magic': validate_table_text(source_1),
|
378 |
'pdfminer': validate_table_text(source_3),
|
379 |
'pymupdf': validate_table_text(source_4_tables)
|
380 |
}
|
381 |
|
382 |
-
# Определяем
|
|
|
383 |
best_table_source = max(table_scores.items(), key=lambda x: x[1])[0]
|
384 |
|
385 |
-
# Выбираем текст
|
|
|
|
|
|
|
|
|
|
|
|
|
386 |
table_text = {
|
387 |
'magic': source_1,
|
388 |
'pdfminer': source_3,
|
@@ -390,27 +419,33 @@ async def process_document(
|
|
390 |
}[best_table_source]
|
391 |
|
392 |
# Комбинируем результаты
|
393 |
-
combined_source = f"{
|
394 |
|
395 |
# Возвращаем результаты со всеми исходниками для сравнения
|
396 |
validated_sources = {
|
397 |
'combined': {
|
398 |
'text': combined_source,
|
399 |
-
'confidence_score': max(table_scores.values()) /
|
|
|
400 |
'table_source': best_table_source,
|
|
|
401 |
'table_scores': table_scores
|
402 |
},
|
403 |
'source_1': {
|
404 |
'text': source_1,
|
405 |
-
'
|
|
|
406 |
},
|
407 |
'source_3': {
|
408 |
'text': source_3,
|
409 |
-
'
|
|
|
410 |
},
|
411 |
'source_4': {
|
412 |
-
'text':
|
413 |
-
'
|
|
|
|
|
414 |
}
|
415 |
}
|
416 |
|
|
|
274 |
source_3 = extract_text_pdfminer(temp_path)
|
275 |
|
276 |
# Source 4: PyMuPDF, specialized for tables
|
277 |
+
def extract_text_pymupdf(pdf_path):
    """Extract plain text and table-like text from a PDF using PyMuPDF.

    Walks the blocks of the first ``end_pages`` pages (``end_pages``, ``fitz``,
    ``re`` and ``logger`` come from the enclosing scope) and splits text
    blocks into two streams: ordinary text and blocks that look like table
    rows. A block is treated as table-like when any of its lines has more
    than three spans, a span contains a column separator, or the line
    contains a date, a decimal amount, or a code of two letters, two digits
    and at least four further alphanumerics.

    Args:
        pdf_path: Path of the PDF handed to ``fitz.open``.

    Returns:
        A ``(text, table_text)`` tuple of newline-joined strings. If anything
        raises, the error is logged and ``(str(error), "")`` is returned.
    """
    table_separators = ("|", "\t", "│")
    # Patterns typical for bank data, compiled once per call.
    bank_patterns = (
        re.compile(r'\d{2}[./-]\d{2}[./-]\d{4}'),  # dates
        re.compile(r'\d+[.,]\d{2}'),               # amounts
        re.compile(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}'),  # codes
    )

    doc = None
    try:
        doc = fitz.open(pdf_path)
        text = []
        table_text = []

        for page_num in range(min(end_pages, doc.page_count)):
            page_dict = doc[page_num].get_text("dict", sort=True)

            # Consecutive table-like blocks of the current page; a non-empty
            # list means we are inside a table run.
            current_table = []

            for block in page_dict.get("blocks", []):
                if block["type"] == 0:  # text block
                    is_table_like = False
                    block_lines = []

                    for line in block.get("lines", []):
                        spans = line.get("spans", [])

                        # Many spans on one line suggests many columns.
                        if len(spans) > 3:
                            is_table_like = True

                        # Round-trip through UTF-8 to drop unencodable
                        # characters (e.g. lone surrogates).
                        span_texts = [
                            span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
                            for span in spans
                        ]

                        if any(sep in chunk for chunk in span_texts for sep in table_separators):
                            is_table_like = True

                        line_text = " ".join(span_texts).strip()

                        if any(pattern.search(line_text) for pattern in bank_patterns):
                            is_table_like = True

                        if line_text:
                            block_lines.append(line_text)

                    block_text = "".join(row + "\n" for row in block_lines)

                    if is_table_like:
                        current_table.append(block_text)
                    else:
                        if current_table:
                            # The table run ended with this ordinary block.
                            table_text.append("\n".join(current_table))
                            current_table = []
                        text.append(block_text)

                elif block["type"] == 1:  # image block (original comment: table/image)
                    if current_table:
                        table_text.append("\n".join(current_table))
                        current_table = []
                    table_text.append("<TABLE_DATA>")

            # Flush a table run that reaches the end of the page; without
            # this the last table on every page would be silently dropped.
            if current_table:
                table_text.append("\n".join(current_table))

        return "\n".join(text), "\n".join(table_text)

    except Exception as e:
        logger.error(f"PyMuPDF extraction error: {str(e)}")
        return str(e), ""

    finally:
        # Release the document handle on both the success and error paths.
        if doc is not None:
            doc.close()
|
344 |
+
|
345 |
+
source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
|
346 |
+
|
347 |
+
def validate_text_quality(text):
    """Score how plausible ``text`` is as extracted Russian banking text.

    The score is the number of Cyrillic words found plus 10 for every
    banking term that occurs in the text (case-insensitive).

    Args:
        text: Extracted text; may be empty or ``None``.

    Returns:
        An integer score; 0 when ``text`` is falsy or shorter than 10
        characters.
    """
    # Too little text to judge.
    if not text or len(text) < 10:
        return 0

    # 'ё'/'Ё' lie outside the а-я / А-Я code point ranges, so they must be
    # listed explicitly; otherwise a word like "счёт" is split in two and
    # counted twice.
    cyrillic_words = re.findall(r'[а-яА-ЯёЁ]+', text)
    score = len(cyrillic_words)

    bank_terms = ('банк', 'счет', 'платеж', 'сумма', 'кредит', 'дебет', 'баланс')
    # Fold 'ё' to 'е' so spellings such as "счёт" / "платёж" match the terms.
    normalized = text.lower().replace('ё', 'е')
    score += sum(10 for term in bank_terms if term in normalized)

    return score
|
364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
def validate_table_text(text):
|
366 |
score = 0
|
367 |
|
|
|
378 |
score += len(codes) * 3
|
379 |
|
380 |
# Проверка на кириллицу
|
381 |
+
cyrillic = re.findall(r'[а-яА-Я]+', text)
|
382 |
+
if cyrillic:
|
383 |
+
score += len(cyrillic)
|
384 |
|
385 |
# Проверка структуры таблицы
|
386 |
if len(re.findall(r'\||\t|│', text)) > 5: # Много разделителей
|
|
|
388 |
|
389 |
return score
|
390 |
|
391 |
+
# Оцениваем все источники
|
392 |
+
text_scores = {
|
393 |
+
'magic': validate_text_quality(source_1),
|
394 |
+
'pdfminer': validate_text_quality(source_3),
|
395 |
+
'pymupdf': validate_text_quality(source_4_text)
|
396 |
+
}
|
397 |
+
|
398 |
table_scores = {
|
399 |
'magic': validate_table_text(source_1),
|
400 |
'pdfminer': validate_table_text(source_3),
|
401 |
'pymupdf': validate_table_text(source_4_tables)
|
402 |
}
|
403 |
|
404 |
+
# Определяем лучшие источники
|
405 |
+
best_text_source = max(text_scores.items(), key=lambda x: x[1])[0]
|
406 |
best_table_source = max(table_scores.items(), key=lambda x: x[1])[0]
|
407 |
|
408 |
+
# Выбираем текст из лучших источников
|
409 |
+
main_text = {
|
410 |
+
'magic': source_1,
|
411 |
+
'pdfminer': source_3,
|
412 |
+
'pymupdf': source_4_text
|
413 |
+
}[best_text_source]
|
414 |
+
|
415 |
table_text = {
|
416 |
'magic': source_1,
|
417 |
'pdfminer': source_3,
|
|
|
419 |
}[best_table_source]
|
420 |
|
421 |
# Комбинируем результаты
|
422 |
+
combined_source = f"{main_text}\n\nTABLE_DATA_START\n{table_text}\nTABLE_DATA_END"
|
423 |
|
424 |
# Возвращаем результаты со всеми исходниками для сравнения
|
425 |
validated_sources = {
|
426 |
'combined': {
|
427 |
'text': combined_source,
|
428 |
+
'confidence_score': (max(text_scores.values()) + max(table_scores.values())) / 40,
|
429 |
+
'text_source': best_text_source,
|
430 |
'table_source': best_table_source,
|
431 |
+
'text_scores': text_scores,
|
432 |
'table_scores': table_scores
|
433 |
},
|
434 |
'source_1': {
|
435 |
'text': source_1,
|
436 |
+
'text_score': text_scores['magic'],
|
437 |
+
'table_score': table_scores['magic']
|
438 |
},
|
439 |
'source_3': {
|
440 |
'text': source_3,
|
441 |
+
'text_score': text_scores['pdfminer'],
|
442 |
+
'table_score': table_scores['pdfminer']
|
443 |
},
|
444 |
'source_4': {
|
445 |
+
'text': source_4_text,
|
446 |
+
'tables': source_4_tables,
|
447 |
+
'text_score': text_scores['pymupdf'],
|
448 |
+
'table_score': table_scores['pymupdf']
|
449 |
}
|
450 |
}
|
451 |
|