Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -256,24 +256,7 @@ async def process_document(
|
|
256 |
)
|
257 |
source_1 = txt_content
|
258 |
|
259 |
-
# Source
|
260 |
-
def extract_text_pdfminer(pdf_path):
|
261 |
-
try:
|
262 |
-
laparams = LAParams(
|
263 |
-
line_margin=0.5,
|
264 |
-
word_margin=0.1,
|
265 |
-
char_margin=2.0,
|
266 |
-
boxes_flow=0.5,
|
267 |
-
detect_vertical=True
|
268 |
-
)
|
269 |
-
text = extract_text(pdf_path, laparams=laparams)
|
270 |
-
return text
|
271 |
-
except Exception as e:
|
272 |
-
return str(e)
|
273 |
-
|
274 |
-
source_3 = extract_text_pdfminer(temp_path)
|
275 |
-
|
276 |
-
# Source 4: PyMuPDF specialized для таблиц
|
277 |
def extract_text_pymupdf(pdf_path):
|
278 |
try:
|
279 |
doc = fitz.open(pdf_path)
|
@@ -290,30 +273,26 @@ async def process_document(
|
|
290 |
for block in page_dict.get("blocks", []):
|
291 |
if block["type"] == 0: # Текстовый блок
|
292 |
block_text = ""
|
293 |
-
# Проверяем, похож ли блок на часть таблицы
|
294 |
is_table_like = False
|
295 |
|
296 |
for line in block.get("lines", []):
|
297 |
line_text = ""
|
298 |
spans = line.get("spans", [])
|
299 |
|
300 |
-
|
301 |
-
if len(spans) > 3: # Много колонок
|
302 |
is_table_like = True
|
303 |
|
304 |
for span in spans:
|
305 |
-
# Принудительно декодируем в UTF-8
|
306 |
span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
|
307 |
if any(sep in span_text for sep in ["|", "\t", "│"]):
|
308 |
is_table_like = True
|
309 |
line_text += span_text + " "
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
re.search(r'
|
314 |
-
re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)): # Коды
|
315 |
is_table_like = True
|
316 |
-
|
317 |
if line_text.strip():
|
318 |
block_text += line_text.strip() + "\n"
|
319 |
|
@@ -328,7 +307,7 @@ async def process_document(
|
|
328 |
in_table = False
|
329 |
text.append(block_text)
|
330 |
|
331 |
-
elif block["type"] == 1:
|
332 |
if in_table:
|
333 |
table_text.append("\n".join(current_table))
|
334 |
current_table = []
|
@@ -344,108 +323,16 @@ async def process_document(
|
|
344 |
|
345 |
source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
|
346 |
|
347 |
-
def validate_text_quality(text):
|
348 |
-
score = 0
|
349 |
-
|
350 |
-
# Базовые проверки текста
|
351 |
-
if not text or len(text) < 10:
|
352 |
-
return 0
|
353 |
-
|
354 |
-
# Проверка кириллицы
|
355 |
-
cyrillic = re.findall(r'[а-яА-Я]+', text)
|
356 |
-
if cyrillic:
|
357 |
-
score += len(cyrillic)
|
358 |
-
|
359 |
-
# Проверка банковских терминов
|
360 |
-
bank_terms = ['банк', 'счет', 'платеж', 'сумма', 'кредит', 'дебет', 'баланс']
|
361 |
-
score += sum(10 for term in bank_terms if term in text.lower())
|
362 |
-
|
363 |
-
return score
|
364 |
-
|
365 |
-
def validate_table_text(text):
|
366 |
-
score = 0
|
367 |
-
|
368 |
-
# Проверка на наличие дат
|
369 |
-
dates = re.findall(r'\d{2}[./-]\d{2}[./-]\d{4}', text)
|
370 |
-
score += len(dates) * 2
|
371 |
-
|
372 |
-
# Проверка на наличие сумм
|
373 |
-
amounts = re.findall(r'\d+[.,]\d{2}', text)
|
374 |
-
score += len(amounts) * 2
|
375 |
-
|
376 |
-
# Проверка на наличие банковских кодов
|
377 |
-
codes = re.findall(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', text)
|
378 |
-
score += len(codes) * 3
|
379 |
-
|
380 |
-
# Проверка на кириллицу
|
381 |
-
cyrillic = re.findall(r'[а-яА-Я]+', text)
|
382 |
-
if cyrillic:
|
383 |
-
score += len(cyrillic)
|
384 |
-
|
385 |
-
# Проверка структуры таблицы
|
386 |
-
if len(re.findall(r'\||\t|│', text)) > 5: # Много разделителей
|
387 |
-
score += 5
|
388 |
-
|
389 |
-
return score
|
390 |
-
|
391 |
-
# Оцениваем все источники
|
392 |
-
text_scores = {
|
393 |
-
'magic': validate_text_quality(source_1),
|
394 |
-
'pdfminer': validate_text_quality(source_3),
|
395 |
-
'pymupdf': validate_text_quality(source_4_text)
|
396 |
-
}
|
397 |
-
|
398 |
-
table_scores = {
|
399 |
-
'magic': validate_table_text(source_1),
|
400 |
-
'pdfminer': validate_table_text(source_3),
|
401 |
-
'pymupdf': validate_table_text(source_4_tables)
|
402 |
-
}
|
403 |
-
|
404 |
-
# Определяем лучшие источники
|
405 |
-
best_text_source = max(text_scores.items(), key=lambda x: x[1])[0]
|
406 |
-
best_table_source = max(table_scores.items(), key=lambda x: x[1])[0]
|
407 |
-
|
408 |
-
# Выбираем текст из лучших источников
|
409 |
-
main_text = {
|
410 |
-
'magic': source_1,
|
411 |
-
'pdfminer': source_3,
|
412 |
-
'pymupdf': source_4_text
|
413 |
-
}[best_text_source]
|
414 |
-
|
415 |
-
table_text = {
|
416 |
-
'magic': source_1,
|
417 |
-
'pdfminer': source_3,
|
418 |
-
'pymupdf': source_4_tables
|
419 |
-
}[best_table_source]
|
420 |
-
|
421 |
-
# Комбинируем результаты
|
422 |
-
combined_source = f"{main_text}\n\nTABLE_DATA_START\n{table_text}\nTABLE_DATA_END"
|
423 |
-
|
424 |
-
# Возвращаем результаты со всеми исходниками для сравнения
|
425 |
validated_sources = {
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
'text': source_1,
|
436 |
-
'text_score': text_scores['magic'],
|
437 |
-
'table_score': table_scores['magic']
|
438 |
-
},
|
439 |
-
'source_3': {
|
440 |
-
'text': source_3,
|
441 |
-
'text_score': text_scores['pdfminer'],
|
442 |
-
'table_score': table_scores['pdfminer']
|
443 |
-
},
|
444 |
-
'source_4': {
|
445 |
-
'text': source_4_text,
|
446 |
-
'tables': source_4_tables,
|
447 |
-
'text_score': text_scores['pymupdf'],
|
448 |
-
'table_score': table_scores['pymupdf']
|
449 |
}
|
450 |
}
|
451 |
|
|
|
256 |
)
|
257 |
source_1 = txt_content
|
258 |
|
259 |
+
# Source 4: PyMuPDF для таблиц
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
def extract_text_pymupdf(pdf_path):
|
261 |
try:
|
262 |
doc = fitz.open(pdf_path)
|
|
|
273 |
for block in page_dict.get("blocks", []):
|
274 |
if block["type"] == 0: # Текстовый блок
|
275 |
block_text = ""
|
|
|
276 |
is_table_like = False
|
277 |
|
278 |
for line in block.get("lines", []):
|
279 |
line_text = ""
|
280 |
spans = line.get("spans", [])
|
281 |
|
282 |
+
if len(spans) > 3:
|
|
|
283 |
is_table_like = True
|
284 |
|
285 |
for span in spans:
|
|
|
286 |
span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
|
287 |
if any(sep in span_text for sep in ["|", "\t", "│"]):
|
288 |
is_table_like = True
|
289 |
line_text += span_text + " "
|
290 |
+
|
291 |
+
if (re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or
|
292 |
+
re.search(r'\d+[.,]\d{2}', line_text) or
|
293 |
+
re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)):
|
|
|
294 |
is_table_like = True
|
295 |
+
|
296 |
if line_text.strip():
|
297 |
block_text += line_text.strip() + "\n"
|
298 |
|
|
|
307 |
in_table = False
|
308 |
text.append(block_text)
|
309 |
|
310 |
+
elif block["type"] == 1:
|
311 |
if in_table:
|
312 |
table_text.append("\n".join(current_table))
|
313 |
current_table = []
|
|
|
323 |
|
324 |
source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
|
325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
326 |
validated_sources = {
|
327 |
+
"sources": {
|
328 |
+
"magic_pdf": {
|
329 |
+
"text": source_1
|
330 |
+
},
|
331 |
+
"pymupdf": {
|
332 |
+
"text": source_4_text,
|
333 |
+
"tables": source_4_tables
|
334 |
+
},
|
335 |
+
"combined": f"{source_1}\n\n### MAGIC_PDF_DATA ###\n{source_1}\n\n### PYMUPDF_DATA ###\n{source_4_tables}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
}
|
337 |
}
|
338 |
|