File size: 5,583 Bytes
7dec78f
 
e92fbe1
7dec78f
17d36dc
 
 
b8d5f22
6544d14
7cb3598
6a30f2e
b8d5f22
 
7cb3598
 
 
 
 
 
 
 
 
 
 
 
b8d5f22
 
 
 
 
 
875f540
b8d5f22
 
 
 
 
 
 
17d36dc
 
 
 
b8d5f22
 
 
 
 
 
 
 
 
 
 
6544d14
 
 
0a3a380
 
 
 
3403d47
0a3a380
 
 
875f540
6544d14
b8d5f22
ce01472
3403d47
 
 
 
 
 
ce01472
3403d47
 
 
 
 
 
 
 
 
 
ce01472
3403d47
 
 
 
 
 
 
 
 
 
ce01472
7cb3598
6a30f2e
3403d47
b8d5f22
 
6a30f2e
b8d5f22
875f540
7dec78f
 
 
3403d47
7dec78f
 
 
 
 
 
 
 
5e94ef1
ce01472
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import html
import io
import json
import os
import tempfile
import traceback

import gradio as gr
import pandas as pd
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image

def _unique_column_names(header):
    """Return *header* with repeated names made unique by a numeric suffix.

    The first occurrence of a name is kept unchanged; each later occurrence
    becomes ``name_1``, ``name_2``, ... skipping any suffix already in use, so
    the returned list never contains duplicates.
    """
    taken = set()
    last_suffix = {}
    unique = []
    for col in header:
        name = col
        if name in taken:
            suffix = last_suffix.get(col, 0)
            # Keep counting until the candidate collides with nothing, which
            # also covers headers that literally contain e.g. "a_1".
            while name in taken:
                suffix += 1
                name = f"{col}_{suffix}"
            last_suffix[col] = suffix
        taken.add(name)
        unique.append(name)
    return unique


def _iter_images(element):
    """Yield every ``LTImage`` at or below *element*, descending into figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _save_image(lt_image, index):
    """Save one pdfminer image as ``extracted_image_<index>.png`` in the CWD.

    Returns the ``{"filename": ...}`` record that is stored in the image list.
    """
    # pdfminer's PDFStream exposes its decoded bytes through get_data();
    # it has no read() method.
    image_data = lt_image.stream.get_data()
    image_filename = f"extracted_image_{index}.png"
    with Image.open(io.BytesIO(image_data)) as image:
        image.save(image_filename)
    return {"filename": image_filename}


def _extract_text_and_images(pdf_path):
    """Collect the text of all horizontal text boxes and save embedded images.

    Returns a ``(text, images)`` pair where *images* is a list of
    ``{"filename": ...}`` records.
    """
    text_parts = []
    images = []
    with open(pdf_path, 'rb') as file:
        for page in extract_pages(file):
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    text_parts.append(element.get_text())
                    continue
                for lt_image in _iter_images(element):
                    # Image extraction is best effort: a stream PIL cannot
                    # decode is reported and skipped without affecting the
                    # other images or the rest of the parse.
                    try:
                        images.append(_save_image(lt_image, len(images)))
                    except Exception as e:
                        print(f"Error extracting image: {e}")
    return "".join(text_parts), images


def _extract_tables(pdf_path):
    """Return one DataFrame per table pdfplumber finds, first row as header."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    # Nothing to build a frame from (and no header row to read).
                    continue
                header = table[0]
                columns = _unique_column_names(header) if header else None
                tables.append(pd.DataFrame(table[1:], columns=columns))
    return tables


def _render_output(output_format, text, tables, images):
    """Render the extracted data as a string in the requested format.

    Returns an empty string for a format other than "JSON", "Markdown" or
    "HTML".
    """
    if output_format == "JSON":
        json_data = {
            "text": text,
            "tables": [table.to_dict(orient='records') for table in tables],
            "images": images
        }
        return json.dumps(json_data, ensure_ascii=False, indent=4)

    if output_format == "Markdown":
        parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
        for i, table in enumerate(tables):
            parts.append(f"## Table {i+1}\n")
            parts.append(table.to_markdown(index=False) + "\n\n")
        parts.append("\n\n# Images\n\n")
        for image in images:
            image_path = os.path.join(os.getcwd(), image["filename"])
            parts.append(f'![Image]({image_path})\n')
        return "".join(parts)

    if output_format == "HTML":
        # The text comes straight from the PDF, so escape it before embedding
        # it in markup; DataFrame.to_html() escapes cell values itself.
        parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
        for i, table in enumerate(tables):
            parts.append(f"<h2>Table {i+1}</h2>\n")
            parts.append(table.to_html() + "<br>")
        parts.append("\n\n<h2>Images</h2>\n\n")
        for image in images:
            image_path = os.path.join(os.getcwd(), image["filename"])
            parts.append(
                f'<img src="{html.escape(image_path, quote=True)}" alt="Image"><br>\n'
            )
        return "".join(parts)

    return ""


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Text is taken from pdfminer's horizontal text boxes, tables from
    pdfplumber (first row used as the header, repeated header names made
    unique), and images are saved as PNG files in the current working
    directory.  The formatted result is written to a UTF-8 temporary file
    that is not deleted automatically.

    NOTE: image files are named ``extracted_image_<n>.png`` with ``n``
    restarting at 0 on every call, so a later call overwrites the files of an
    earlier one.

    Args:
        pdf_file: Path to the uploaded PDF file.  An object with a ``name``
            attribute holding the path is accepted as well (presumably what
            some Gradio versions pass for a file input -- confirm against the
            installed version).
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object for displaying progress (currently
            not updated by this function).

    Returns:
        tuple: Extracted text and the path of the temporary file holding the
            output in the specified format.
            Returns an empty string and None if there is an error.
    """
    try:
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        text, images = _extract_text_and_images(pdf_path)
        tables = _extract_tables(pdf_path)

        # Render first so that a formatting error does not leave an empty
        # temporary file behind.
        content = _render_output(output_format, text, tables, images)

        # Explicit UTF-8: the content may hold arbitrary non-ASCII text from
        # the PDF, which the platform default encoding may not be able to write.
        with tempfile.NamedTemporaryFile(
            mode="w",
            delete=False,
            suffix="." + output_format.lower(),
            encoding="utf-8",
        ) as tmp:
            tmp.write(content)
            download_path = tmp.name

        return text, download_path

    except Exception as main_e:
        # Top-level UI boundary: report the failure on the console and hand
        # Gradio empty outputs instead of propagating the exception.
        traceback.print_exc()  # Print full traceback to console
        print(f"A main error occurred: {main_e}")
        return "", None

# Gradio UI: a file upload plus an output-format selector wired to parse_pdf,
# showing the extracted text and offering the formatted result as a download.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # Pre-select a format so that submitting without touching the dropdown
        # does not hand None to parse_pdf (which would fail on
        # output_format.lower() and return empty outputs).
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output")
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format."
)

if __name__ == "__main__":
    # Launched with default options; no share=True is requested here.
    iface.launch()