pdf_export_tg_bot

Runtime error

App Files Community

dmitrynovikov7211 committed on Jan 31

Commit

c359014

verified ·

1 Parent(s): e9d3c0c

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -83

app.py CHANGED Viewed

@@ -274,79 +274,94 @@ async def process_document(
         source_3 = extract_text_pdfminer(temp_path)
         # Source 4: PyMuPDF specialized для таблиц
-       # Source 4: PyMuPDF специально для таблиц
-       def extract_text_pymupdf(pdf_path):
-           try:
-               doc = fitz.open(pdf_path)
-               text = []
-               table_text = []
-               for page_num in range(min(end_pages, doc.page_count)):
-                   page = doc[page_num]
-                   page_dict = page.get_text("dict", sort=True)
-                   in_table = False
-                   current_table = []
-                   for block in page_dict.get("blocks", []):
-                       if block["type"] == 0:  # Текстовый блок
-                           block_text = ""
-                           # Проверяем, похож ли блок на часть таблицы
-                           is_table_like = False
-                           for line in block.get("lines", []):
-                               line_text = ""
-                               spans = line.get("spans", [])
-                               # Проверяем характеристики, типичные для таблиц
-                               if len(spans) > 3:  # Много колонок
-                                   is_table_like = True
-                               for span in spans:
-                                   span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
-                                   # Ищем типичные разделители таблиц
-                                   if any(sep in span_text for sep in ["|", "\t", "│"]):
-                                       is_table_like = True
-                                   line_text += span_text + " "
-                               # Проверяем наличие цифр и дат - характерно для банковских таблиц
-                               if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or \
-                                  re.search(r'\d+[.,]\d{2}', line_text):
-                                   is_table_like = True
-                               if line_text.strip():
-                                   block_text += line_text.strip() + "\n"
-                           if is_table_like:
-                               if not in_table:
-                                   in_table = True
-                               current_table.append(block_text)
-                           else:
-                               if in_table:
-                                   # Закончилась таблица
-                                   table_text.append("\n".join(current_table))
-                                   current_table = []
-                                   in_table = False
-                               text.append(block_text)
-                       elif block["type"] == 1:  # Таблица/изображение
-                           if in_table:
-                               table_text.append("\n".join(current_table))
-                               current_table = []
-                               in_table = False
-                           table_text.append("<TABLE_DATA>")
-               doc.close()
-               return "\n".join(text), "\n".join(table_text)
-           except Exception as e:
-               logger.error(f"PyMuPDF extraction error: {str(e)}")
-               return str(e), ""
-       source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
-        # Валидация для определения лучшего источника таблиц
         def validate_table_text(text):
             score = 0
@@ -363,8 +378,9 @@ async def process_document(
             score += len(codes) * 3
             # Проверка на кириллицу
-            if bool(re.search('[а-яА-Я]', text)):
-                score += 10
             # Проверка структуры таблицы
             if len(re.findall(r'\||\t|│', text)) > 5:  # Много разделителей
@@ -372,17 +388,30 @@ async def process_document(
             return score
-        # Проверяем все источники
         table_scores = {
             'magic': validate_table_text(source_1),
             'pdfminer': validate_table_text(source_3),
             'pymupdf': validate_table_text(source_4_tables)
         }
-        # Определяем лучший источник для таблиц
         best_table_source = max(table_scores.items(), key=lambda x: x[1])[0]
-        # Выбираем текст таблиц из лучшего источника
         table_text = {
             'magic': source_1,
             'pdfminer': source_3,
@@ -390,27 +419,33 @@ async def process_document(
         }[best_table_source]
         # Комбинируем результаты
-        combined_source = f"{source_1}\n\nTABLE_DATA_START\n{table_text}\nTABLE_DATA_END"
         # Возвращаем результаты со всеми исходниками для сравнения
         validated_sources = {
             'combined': {
                 'text': combined_source,
-                'confidence_score': max(table_scores.values()) / 20,
                 'table_source': best_table_source,
                 'table_scores': table_scores
             },
             'source_1': {
                 'text': source_1,
-                'score': table_scores['magic']
             },
             'source_3': {
                 'text': source_3,
-                'score': table_scores['pdfminer']
             },
             'source_4': {
-                'text': source_4_tables,
-                'score': table_scores['pymupdf']
             }
         }

         source_3 = extract_text_pdfminer(temp_path)
         # Source 4: PyMuPDF, specialized for tables
+        def extract_text_pymupdf(pdf_path):
+            try:
+                doc = fitz.open(pdf_path)
+                text = []
+                table_text = []
+                for page_num in range(min(end_pages, doc.page_count)):
+                    page = doc[page_num]
+                    page_dict = page.get_text("dict", sort=True)
+                    in_table = False
+                    current_table = []
+                    for block in page_dict.get("blocks", []):
+                        if block["type"] == 0:  # Текстовый блок
+                            block_text = ""
+                            # Проверяем, похож ли блок на часть таблицы
+                            is_table_like = False
+                            for line in block.get("lines", []):
+                                line_text = ""
+                                spans = line.get("spans", [])
+                                # Проверяем характеристики, типичные для таблиц
+                                if len(spans) > 3:  # Много колонок
+                                    is_table_like = True
+                                for span in spans:
+                                    # Принудительно декодируем в UTF-8
+                                    span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
+                                    if any(sep in span_text for sep in ["|", "\t", "│"]):
+                                        is_table_like = True
+                                    line_text += span_text + " "
+                                # Проверяем на банковские данные
+                                if (re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or  # Даты
+                                    re.search(r'\d+[.,]\d{2}', line_text) or              # Суммы
+                                    re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)):  # Коды
+                                    is_table_like = True
+                                if line_text.strip():
+                                    block_text += line_text.strip() + "\n"
+                            if is_table_like:
+                                if not in_table:
+                                    in_table = True
+                                current_table.append(block_text)
+                            else:
+                                if in_table:
+                                    table_text.append("\n".join(current_table))
+                                    current_table = []
+                                    in_table = False
+                                text.append(block_text)
+                        elif block["type"] == 1:  # Таблица/изображение
+                            if in_table:
+                                table_text.append("\n".join(current_table))
+                                current_table = []
+                                in_table = False
+                            table_text.append("<TABLE_DATA>")
+                doc.close()
+                return "\n".join(text), "\n".join(table_text)
+            except Exception as e:
+                logger.error(f"PyMuPDF extraction error: {str(e)}")
+                return str(e), ""
+        source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
+        def validate_text_quality(text):
+            score = 0
+            # Базовые проверки текста
+            if not text or len(text) < 10:
+                return 0
+            # Проверка кириллицы
+            cyrillic = re.findall(r'[а-яА-Я]+', text)
+            if cyrillic:
+                score += len(cyrillic)
+            # Проверка банковских терминов
+            bank_terms = ['банк', 'счет', 'платеж', 'сумма', 'кредит', 'дебет', 'баланс']
+            score += sum(10 for term in bank_terms if term in text.lower())
+            return score
         def validate_table_text(text):
             score = 0
             score += len(codes) * 3
             # Проверка на кириллицу
+            cyrillic = re.findall(r'[а-яА-Я]+', text)
+            if cyrillic:
+                score += len(cyrillic)
             # Проверка структуры таблицы
             if len(re.findall(r'\||\t|│', text)) > 5:  # Много разделителей
             return score
+        # Оцениваем все источники
+        text_scores = {
+            'magic': validate_text_quality(source_1),
+            'pdfminer': validate_text_quality(source_3),
+            'pymupdf': validate_text_quality(source_4_text)
+        }
         table_scores = {
             'magic': validate_table_text(source_1),
             'pdfminer': validate_table_text(source_3),
             'pymupdf': validate_table_text(source_4_tables)
         }
+        # Определяем лучшие источники
+        best_text_source = max(text_scores.items(), key=lambda x: x[1])[0]
         best_table_source = max(table_scores.items(), key=lambda x: x[1])[0]
+        # Выбираем текст из лучших источников
+        main_text = {
+            'magic': source_1,
+            'pdfminer': source_3,
+            'pymupdf': source_4_text
+        }[best_text_source]
         table_text = {
             'magic': source_1,
             'pdfminer': source_3,
         }[best_table_source]
         # Комбинируем результаты
+        combined_source = f"{main_text}\n\nTABLE_DATA_START\n{table_text}\nTABLE_DATA_END"
         # Возвращаем результаты со всеми исходниками для сравнения
         validated_sources = {
             'combined': {
                 'text': combined_source,
+                'confidence_score': (max(text_scores.values()) + max(table_scores.values())) / 40,
+                'text_source': best_text_source,
                 'table_source': best_table_source,
+                'text_scores': text_scores,
                 'table_scores': table_scores
             },
             'source_1': {
                 'text': source_1,
+                'text_score': text_scores['magic'],
+                'table_score': table_scores['magic']
             },
             'source_3': {
                 'text': source_3,
+                'text_score': text_scores['pdfminer'],
+                'table_score': table_scores['pdfminer']
             },
             'source_4': {
+                'text': source_4_text,
+                'tables': source_4_tables,
+                'text_score': text_scores['pymupdf'],
+                'table_score': table_scores['pymupdf']
             }
         }