pdf_export_tg_bot

Runtime error

App Files Community

dmitrynovikov7211 committed on Jan 31

Commit

f3f0948

verified ·

1 Parent(s): c91f495

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -79

app.py CHANGED Viewed

@@ -226,16 +226,14 @@ def to_pdf(file_path):
             return tmp_file_path
 @app.post("/process_document")
 async def process_document(
     file: UploadFile = File(...),
     end_pages: int = 10,
     is_ocr: bool = False,
-    layout_mode: str = "doclayout_yolo",
     formula_enable: bool = True,
-    table_enable: bool = True,
     language: str = "auto"
 ):
     try:
@@ -244,7 +242,7 @@ async def process_document(
             content = await file.read()
             buffer.write(content)
-        # Source 1: magic-pdf для основного текста
         md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
             temp_path,
             end_pages=end_pages,
@@ -254,98 +252,89 @@ async def process_document(
             table_enable=table_enable,
             language=language
         )
         def extract_text_pymupdf(pdf_path):
             try:
                 doc = fitz.open(pdf_path)
-                text = []
-                table_text = []
                 for page_num in range(min(end_pages, doc.page_count)):
                     page = doc[page_num]
-                    page_dict = page.get_text("dict", sort=True)
-                    in_table = False
-                    current_table = []
-                    for block in page_dict.get("blocks", []):
-                        if block["type"] == 0:  # Текстовый блок
-                            block_text = ""
-                            is_table_like = False
-                            for line in block.get("lines", []):
-                                line_text = ""
-                                spans = line.get("spans", [])
-                                if len(spans) > 3:
-                                    is_table_like = True
-                                for span in spans:
-                                    span_text = span.get("text", "").encode('utf-8', errors='ignore').decode('utf-8')
-                                    if any(sep in span_text for sep in ["|", "\t", "│"]):
-                                        is_table_like = True
-                                    line_text += span_text + " "
-                                if (re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line_text) or
-                                    re.search(r'\d+[.,]\d{2}', line_text) or
-                                    re.search(r'[A-Z]{2}\d{2}[A-Z0-9]{4,}', line_text)):
-                                    is_table_like = True
-                                if line_text.strip():
-                                    block_text += line_text.strip() + "\n"
-                            if is_table_like:
-                                if not in_table:
-                                    in_table = True
-                                current_table.append(block_text)
-                            else:
-                                if in_table:
-                                    table_text.append("\n".join(current_table))
-                                    current_table = []
-                                    in_table = False
-                                text.append(block_text)
-                        elif block["type"] == 1:
-                            if in_table:
-                                table_text.append("\n".join(current_table))
-                                current_table = []
-                                in_table = False
-                            table_text.append("<TABLE_DATA>")
                 doc.close()
-                return "\n".join(text), "\n".join(table_text)
             except Exception as e:
-                logger.error(f"PyMuPDF extraction error: {str(e)}")
-                return str(e), ""
-        source_4_text, source_4_tables = extract_text_pymupdf(temp_path)
-        # Возвращаем в формате, который ожидает DeepSeek
-        # Возвращаем в том формате, который ожидает process_sources_generator
-        return JSONResponse({
-            "sources": {
-                "magic_pdf": {
-                    "text": txt_content
-                },
-                "pymupdf": {
-                    "text": f"{source_4_text}\n{source_4_tables}"
                 }
-            }
         })
     except Exception as e:
-        logger.error(f"Process document error: {str(e)}", exc_info=True)
         return JSONResponse(
             status_code=500,
             content={"error": str(e)}
         )
-    finally:
-        try:
-            if os.path.exists(temp_path):
-                os.remove(temp_path)
-        except Exception as e:
-            logger.error(f"Cleanup error: {str(e)}")
 # Initialize models
 model_init = init_model()

             return tmp_file_path
 @app.post("/process_document")
 async def process_document(
     file: UploadFile = File(...),
     end_pages: int = 10,
     is_ocr: bool = False,
+    layout_mode: str = "doclayout_yolo",
     formula_enable: bool = True,
+    table_enable: bool = False,
     language: str = "auto"
 ):
     try:
             content = await file.read()
             buffer.write(content)
+        # Source 1: magic-pdf processing
         md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
             temp_path,
             end_pages=end_pages,
             table_enable=table_enable,
             language=language
         )
+        source_1 = txt_content
+        # Source 3: PDFMiner
+        def extract_text_pdfminer(pdf_path):
+            try:
+                laparams = LAParams(
+                    line_margin=0.5,
+                    word_margin=0.1,
+                    char_margin=2.0,
+                    boxes_flow=0.5,
+                    detect_vertical=True
+                )
+                text = extract_text(pdf_path, laparams=laparams)
+                return text
+            except Exception as e:
+                return str(e)
+        source_3 = extract_text_pdfminer(temp_path)
+        # Source 4: PyMuPDF (more precise for tables and structured content)
         def extract_text_pymupdf(pdf_path):
             try:
                 doc = fitz.open(pdf_path)
+                text = ""
                 for page_num in range(min(end_pages, doc.page_count)):
                     page = doc[page_num]
+                    # Extract text with preserved formatting
+                    blocks = page.get_text("blocks")
+                    # Sort blocks by vertical position then horizontal
+                    blocks.sort(key=lambda b: (b[1], b[0]))
+                    for b in blocks:
+                        text += b[4] + "\n"
                 doc.close()
+                return text
             except Exception as e:
+                return str(e)
+        source_4 = extract_text_pymupdf(temp_path)
+        # Clean up
+        os.remove(temp_path)
+        # Compare and validate results
+        def validate_results(sources):
+            # Basic validation checks
+            validated_results = {}
+            for idx, source in sources.items():
+                # Check for common banking keywords
+                banking_keywords = ['balance', 'deposit', 'withdrawal', 'transaction', 'account']
+                keyword_presence = sum(1 for keyword in banking_keywords if keyword.lower() in source.lower())
+                # Check for number patterns (amounts)
+                amount_pattern = r'\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
+                amounts_found = len(re.findall(amount_pattern, source))
+                # Check for date patterns
+                date_pattern = r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}'
+                dates_found = len(re.findall(date_pattern, source))
+                validated_results[idx] = {
+                    'text': source,
+                    'confidence_score': (keyword_presence + amounts_found + dates_found) / 10,
+                    'amounts_found': amounts_found,
+                    'dates_found': dates_found
                 }
+            return validated_results
+        validated_sources = validate_results({
+            'source_1': source_1,
+            'source_3': source_3,
+            'source_4': source_4
+        })
+        return JSONResponse({
+            "sources": validated_sources
         })
     except Exception as e:
         return JSONResponse(
             status_code=500,
             content={"error": str(e)}
         )
 # Initialize models
 model_init = init_model()