pdf_export_tg_bot

Runtime error

App Files Files Community

dmitrynovikov7211 committed on Jan 31

Commit

5832897

verified ·

1 Parent(s): c359014

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -130

app.py CHANGED Viewed

@@ -256,24 +256,7 @@ async def process_document(
         )
         source_1 = txt_content
-        # Source 3: PDFMiner
-        def extract_text_pdfminer(pdf_path):
-            try:
-                laparams = LAParams(
-                    line_margin=0.5,
-                    word_margin=0.1,
-                    char_margin=2.0,
-                    boxes_flow=0.5,
-                    detect_vertical=True
-                )
-                text = extract_text(pdf_path, laparams=laparams)
-                return text
-            except Exception as e:
-                return str(e)
-        source_3 = extract_text_pdfminer(temp_path)
-        # Source 4: PyMuPDF specialized для таблиц
         def extract_text_pymupdf(pdf_path):
             try:
                 doc = fitz.open(pdf_path)
@@ -290,30 +273,26 @@ async def process_document(
                     for block in page_dict.get("blocks", []):
                         if block["type"] == 0:  # Текстовый блок
                             block_text = ""
-                            # Проверяем, похож ли блок на часть таблицы
                             is_table_like = False
                             for line in block.get("lines", []):
                                 line_text = ""
                                 spans = line.get("spans", [])
-                                # Проверяем характеристики, типичные для таблиц
-                                if len(spans) > 3:  # Много колонок
                                     is_table_like = True
                                 for span in spans:
-                                    # Принудительно декодируем в UTF-8
                                     span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
                                     if any(sep in span_text for sep in ["|", "\t", "│"]):
                                         is_table_like = True
                                     line_text += span_text + " "
-                                # Проверяем на банковские данные
-                                if (re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or  # Даты
-                                    re.search(r'\d+[.,]\d{2}', line_text) or              # Суммы
-                                    re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)):  # Коды
                                     is_table_like = True
                                 if line_text.strip():
                                     block_text += line_text.strip() + "\n"
@@ -328,7 +307,7 @@ async def process_document(
                                     in_table = False
                                 text.append(block_text)
-                        elif block["type"] == 1:  # Таблица/изображение
                             if in_table:
                                 table_text.append("\n".join(current_table))
                                 current_table = []
@@ -344,108 +323,16 @@ async def process_document(
         source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
-        def validate_text_quality(text):
-            score = 0
-            # Базовые проверки текста
-            if not text or len(text) < 10:
-                return 0
-            # Проверка кириллицы
-            cyrillic = re.findall(r'[а-яА-Я]+', text)
-            if cyrillic:
-                score += len(cyrillic)
-            # Проверка банковских терминов
-            bank_terms = ['банк', 'счет', 'платеж', 'сумма', 'кредит', 'дебет', 'баланс']
-            score += sum(10 for term in bank_terms if term in text.lower())
-            return score
-        def validate_table_text(text):
-            score = 0
-            # Проверка на наличие дат
-            dates = re.findall(r'\d{2}[./-]\d{2}[./-]\d{4}', text)
-            score += len(dates) * 2
-            # Проверка на наличие сумм
-            amounts = re.findall(r'\d+[.,]\d{2}', text)
-            score += len(amounts) * 2
-            # Проверка на наличие банковских кодов
-            codes = re.findall(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', text)
-            score += len(codes) * 3
-            # Проверка на кириллицу
-            cyrillic = re.findall(r'[а-яА-Я]+', text)
-            if cyrillic:
-                score += len(cyrillic)
-            # Проверка структуры таблицы
-            if len(re.findall(r'\||\t|│', text)) > 5:  # Много разделителей
-                score += 5
-            return score
-        # Оцениваем все источники
-        text_scores = {
-            'magic': validate_text_quality(source_1),
-            'pdfminer': validate_text_quality(source_3),
-            'pymupdf': validate_text_quality(source_4_text)
-        }
-        table_scores = {
-            'magic': validate_table_text(source_1),
-            'pdfminer': validate_table_text(source_3),
-            'pymupdf': validate_table_text(source_4_tables)
-        }
-        # Определяем лучшие источники
-        best_text_source = max(text_scores.items(), key=lambda x: x[1])[0]
-        best_table_source = max(table_scores.items(), key=lambda x: x[1])[0]
-        # Выбираем текст из лучших источников
-        main_text = {
-            'magic': source_1,
-            'pdfminer': source_3,
-            'pymupdf': source_4_text
-        }[best_text_source]
-        table_text = {
-            'magic': source_1,
-            'pdfminer': source_3,
-            'pymupdf': source_4_tables
-        }[best_table_source]
-        # Комбинируем результаты
-        combined_source = f"{main_text}\n\nTABLE_DATA_START\n{table_text}\nTABLE_DATA_END"
-        # Возвращаем результаты со всеми исходниками для сравнения
         validated_sources = {
-            'combined': {
-                'text': combined_source,
-                'confidence_score': (max(text_scores.values()) + max(table_scores.values())) / 40,
-                'text_source': best_text_source,
-                'table_source': best_table_source,
-                'text_scores': text_scores,
-                'table_scores': table_scores
-            },
-            'source_1': {
-                'text': source_1,
-                'text_score': text_scores['magic'],
-                'table_score': table_scores['magic']
-            },
-            'source_3': {
-                'text': source_3,
-                'text_score': text_scores['pdfminer'],
-                'table_score': table_scores['pdfminer']
-            },
-            'source_4': {
-                'text': source_4_text,
-                'tables': source_4_tables,
-                'text_score': text_scores['pymupdf'],
-                'table_score': table_scores['pymupdf']
             }
         }

         )
         source_1 = txt_content
+        # Source 4: PyMuPDF для таблиц
         def extract_text_pymupdf(pdf_path):
             try:
                 doc = fitz.open(pdf_path)
                     for block in page_dict.get("blocks", []):
                         if block["type"] == 0:  # Текстовый блок
                             block_text = ""
                             is_table_like = False
                             for line in block.get("lines", []):
                                 line_text = ""
                                 spans = line.get("spans", [])
+                                if len(spans) > 3:
                                     is_table_like = True
                                 for span in spans:
                                     span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
                                     if any(sep in span_text for sep in ["|", "\t", "│"]):
                                         is_table_like = True
                                     line_text += span_text + " "
+                                if (re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or
+                                    re.search(r'\d+[.,]\d{2}', line_text) or
+                                    re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)):
                                     is_table_like = True
                                 if line_text.strip():
                                     block_text += line_text.strip() + "\n"
                                     in_table = False
                                 text.append(block_text)
+                        elif block["type"] == 1:
                             if in_table:
                                 table_text.append("\n".join(current_table))
                                 current_table = []
         source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
         validated_sources = {
+            "sources": {
+                "magic_pdf": {
+                    "text": source_1
+                },
+                "pymupdf": {
+                    "text": source_4_text,
+                    "tables": source_4_tables
+                },
+                "combined": f"{source_1}\n\n### MAGIC_PDF_DATA ###\n{source_1}\n\n### PYMUPDF_DATA ###\n{source_4_tables}"
             }
         }