File size: 5,833 Bytes
7dec78f
 
e92fbe1
7dec78f
17d36dc
 
 
b8d5f22
6544d14
7cb3598
6a30f2e
41a1dac
b8d5f22
28f23fa
 
e2fb9c7
28f23fa
 
 
 
 
e2fb9c7
 
28f23fa
 
 
41a1dac
 
 
 
 
 
 
 
 
 
 
 
 
 
b8d5f22
7cb3598
 
 
 
 
 
 
 
 
 
 
 
b8d5f22
 
 
 
 
 
875f540
b8d5f22
 
 
 
e2fb9c7
28f23fa
b8d5f22
41a1dac
 
6544d14
 
 
e17150e
 
 
 
 
 
 
 
 
 
 
 
 
b8d5f22
ce01472
3403d47
 
41a1dac
3403d47
 
 
ce01472
3403d47
41a1dac
3403d47
 
 
 
 
 
 
 
ce01472
3403d47
41a1dac
3403d47
 
 
 
 
 
 
 
ce01472
7cb3598
6a30f2e
41a1dac
b8d5f22
 
6a30f2e
b8d5f22
875f540
7dec78f
 
 
3403d47
7dec78f
 
 
 
 
 
 
 
5e94ef1
e2fb9c7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import json
import gradio as gr
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
import os
import io
from PIL import Image
import pandas as pd
import pdfplumber
import tempfile
import traceback
import re

def save_image(element, images):
    """Save the image data carried by a layout element as a PNG file.

    An element that exposes a non-empty ``stream`` is decoded with PIL from
    the stream's raw bytes and written to ``extracted_image_<n>.png`` in the
    current working directory, where ``<n>`` is the number of images
    collected so far. An element without a stream that is iterable (the
    caller also passes ``LTFigure`` containers) is searched recursively, so
    images nested inside a figure are saved as well.

    Args:
        element: A layout element; either an image-like object with a
            ``stream`` attribute or a container of child elements.
        images: List that receives one ``{"filename": ...}`` dict per saved
            image. It is mutated in place.

    Returns:
        None. Failures are reported on stdout and never raised, so a single
        bad image does not abort the surrounding extraction.
    """
    try:
        stream = getattr(element, "stream", None)
        if stream:
            # NOTE(review): get_rawdata() presumably returns the stream bytes
            # as stored in the PDF, so only payloads PIL can identify by
            # itself will open here; anything else lands in the except below.
            image_data = stream.get_rawdata()
            image = Image.open(io.BytesIO(image_data))
            image_filename = f"extracted_image_{len(images)}.png"
            image.save(image_filename)
            images.append({"filename": image_filename})
        elif hasattr(element, "__iter__") and not isinstance(element, (str, bytes)):
            # Container element: look for images among its children. Children
            # that are neither images nor containers are skipped silently to
            # avoid one "No stream data" message per unrelated child.
            for child in element:
                if hasattr(child, "stream") or hasattr(child, "__iter__"):
                    save_image(child, images)
        else:
            print("No stream data for image element")
    except Exception as e:
        print(f"Error extracting image: {e}")

def detect_headers(text):
    """Mark header-looking lines of *text* with a leading ``# ``.

    The text is split on newlines and every line is stripped of surrounding
    whitespace. A stripped line counts as a header when it starts with a
    number followed by a dot and whitespace, consists only of capital letters
    and whitespace, or starts with a capitalised word followed by whitespace
    and a digit. Every output line, header or not, ends with a newline.
    """
    header_res = [
        re.compile(pattern)
        for pattern in (r"^\d+\.\s", r"^[A-Z\s]+$", r"^[A-Z][a-z]+\s\d")
    ]

    def _render(raw_line):
        stripped = raw_line.strip()
        looks_like_header = any(rx.match(stripped) for rx in header_res)
        if looks_like_header:
            return f"# {stripped}\n"
        return f"{stripped}\n"

    return "".join(_render(raw_line) for raw_line in text.split('\n'))

def _unique_column_names(header):
    """Return *header* with repeated names suffixed so every name is unique."""
    taken = set()
    next_suffix = {}
    unique = []
    for col in header:
        name = col
        if name in taken:
            # Keep counting until the candidate collides with nothing already
            # emitted, including an earlier generated or literal "X_1".
            suffix = next_suffix.get(col, 1)
            name = f"{col}_{suffix}"
            while name in taken:
                suffix += 1
                name = f"{col}_{suffix}"
            next_suffix[col] = suffix + 1
        taken.add(name)
        unique.append(name)
    return unique


def _extract_text_and_images(pdf_path):
    """Collect the text of horizontal text boxes and save figure/image elements."""
    text_parts = []
    images = []
    with open(pdf_path, 'rb') as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                elif isinstance(element, (LTFigure, LTImage)):
                    print(f"Processing element: {type(element)}")
                    save_image(element, images)
    return "".join(text_parts), images


def _extract_tables(pdf_path):
    """Return one DataFrame per table pdfplumber finds, first row used as header."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue
                try:
                    header = table[0]
                    columns = _unique_column_names(header) if header else None
                    tables.append(pd.DataFrame(table[1:], columns=columns))
                except Exception as e:
                    # Best effort: one malformed table must not lose the rest.
                    print(f"Error processing table: {e}")
    return tables


def _render_json(formatted_text, tables, images):
    """Serialise text, tables (as record lists) and image entries to JSON."""
    json_data = {
        "text": formatted_text,
        "tables": [
            table.to_dict(orient='records')
            for table in tables
            if not table.columns.duplicated().any()
        ],
        "images": images,
    }
    return json.dumps(json_data, ensure_ascii=False, indent=4)


def _render_markdown(formatted_text, tables, images):
    """Build the Markdown document: text, numbered tables, image links."""
    parts = [f"# Extracted Text\n\n{formatted_text}\n\n# Tables\n"]
    for i, table in enumerate(tables):
        if not table.columns.duplicated().any():
            parts.append(f"## Table {i+1}\n")
            parts.append(table.to_markdown(index=False) + "\n\n")
    parts.append("\n\n# Images\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f'![Image]({image_path})\n')
    return "".join(parts)


def _render_html(formatted_text, tables, images):
    """Build the HTML document: escaped text, numbered tables, image tags."""
    import html  # local import: only this renderer needs it

    # The text comes straight from the uploaded PDF, so it is escaped before
    # being embedded in markup.
    parts = [f"<p>{html.escape(formatted_text)}</p>\n\n<h2>Tables</h2>\n"]
    for i, table in enumerate(tables):
        if not table.columns.duplicated().any():
            parts.append(f"<h2>Table {i+1}</h2>\n")
            parts.append(table.to_html() + "<br>")
    parts.append("\n\n<h2>Images</h2>\n\n")
    for image in images:
        image_path = os.path.join(os.getcwd(), image["filename"])
        parts.append(f'<img src="{html.escape(image_path, quote=True)}" alt="Image"><br>\n')
    return "".join(parts)


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Text from horizontal text boxes is run through ``detect_headers``; tables
    are read with pdfplumber into DataFrames (repeated header names are made
    unique); figure/image elements are handed to ``save_image``. The result is
    written as UTF-8 to a temporary file that is not deleted automatically.

    Args:
        pdf_file: Path to the uploaded PDF file. An object exposing the path
            on a ``name`` attribute is accepted too.
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object; currently not used by the function.

    Returns:
        tuple: ``(formatted_text, download_path)`` where ``download_path`` is
            the temporary file holding the output in the requested format.
            Returns an empty string and None if there is an error, including
            a missing or unsupported ``output_format``.
    """
    renderers = {
        "JSON": _render_json,
        "Markdown": _render_markdown,
        "HTML": _render_html,
    }
    try:
        render = renderers.get(output_format)
        if render is None:
            # Fail before doing any extraction work; reported via the handler below.
            raise ValueError(f"Unsupported output format: {output_format!r}")

        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            # NOTE(review): some Gradio versions appear to pass the upload as a
            # temp-file wrapper rather than a path; fall back to its `.name`.
            pdf_path = getattr(pdf_file, "name", pdf_file)

        text, images = _extract_text_and_images(pdf_path)
        formatted_text = detect_headers(text)
        tables = _extract_tables(pdf_path)
        document = render(formatted_text, tables, images)

        # Explicit UTF-8: the JSON is produced with ensure_ascii=False and the
        # platform default encoding may not cover the extracted characters.
        with tempfile.NamedTemporaryFile(
            mode="w",
            delete=False,
            suffix="." + output_format.lower(),
            encoding="utf-8",
        ) as tmp:
            tmp.write(document)
            download_path = tmp.name

        return formatted_text, download_path

    except Exception as main_e:
        # UI callback boundary: report the failure and hand back empty outputs.
        traceback.print_exc()  # Print full traceback to console
        print(f"A main error occurred: {main_e}")
        return "", None

# Gradio UI: one file upload plus a format selector in, the extracted text and
# a downloadable result file out.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Pre-select a format so that submitting without touching the dropdown
        # does not pass None to parse_pdf.
        gr.Dropdown(["JSON", "Markdown", "HTML"], value="JSON", label="Output Format"),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output")
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format."
)

if __name__ == "__main__":
    iface.launch()  # Temporarily disable sharing for debugging