File size: 5,850 Bytes
7dec78f
 
e92fbe1
7dec78f
17d36dc
 
 
b8d5f22
6544d14
b8d5f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f7269c
b8d5f22
 
 
 
 
 
 
 
 
 
 
 
 
 
17d36dc
 
 
 
b8d5f22
 
 
 
 
 
 
 
 
 
 
6544d14
 
 
 
0a3a380
 
 
 
 
 
 
 
 
 
 
6544d14
b8d5f22
 
 
 
 
0a3a380
b8d5f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7dec78f
 
 
7c51401
7dec78f
 
 
 
 
 
 
 
5e94ef1
17d36dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import html
import io
import json
import os
import tempfile

import gradio as gr
import pandas as pd
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
from PIL import Image

def _iter_images(element):
    """Yield every LTImage found in *element*, descending into nested figures."""
    if isinstance(element, LTImage):
        yield element
    elif isinstance(element, LTFigure):
        for child in element:
            yield from _iter_images(child)


def _save_image(lt_image, index):
    """Save one pdfminer image as ``extracted_image_<index>.png``; return the name."""
    # PDFStream exposes its payload through get_data(); it has no read() method.
    payload = lt_image.stream.get_data()
    filename = f"extracted_image_{index}.png"
    with Image.open(io.BytesIO(payload)) as picture:
        picture.save(filename)
    return filename


def _extract_text_and_images(pdf_path, progress):
    """Return ``(text, images)`` collected from the layout of every page."""
    chunks = []
    images = []
    with open(pdf_path, 'rb') as file:
        pages = list(extract_pages(file))  # materialised so len() is available
    for i, page in enumerate(pages):
        progress(i / len(pages))  # Update progress bar
        for element in page:
            if isinstance(element, LTTextBoxHorizontal):
                chunks.append(element.get_text())
                continue
            for lt_image in _iter_images(element):
                # Best effort: an image Pillow cannot decode must not abort
                # the rest of the extraction.
                try:
                    filename = _save_image(lt_image, len(images))
                except Exception as e:
                    print(f"Error extracting image: {e}")
                else:
                    images.append({"filename": filename})
    return "".join(chunks), images


def _unique_headers(header_row):
    """Return *header_row* with repeated labels suffixed ``_1``, ``_2``, ..."""
    taken = set()
    labels = []
    for cell in header_row:
        label = cell
        suffix = 0
        # Keep counting until the candidate collides with nothing already used,
        # including labels that were themselves produced by an earlier rename.
        while label in taken:
            suffix += 1
            label = f"{cell}_{suffix}"
        taken.add(label)
        labels.append(label)
    return labels


def _extract_tables(pdf_path):
    """Return one DataFrame per table that pdfplumber finds in the PDF."""
    frames = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table:
                    continue  # nothing to build a frame from
                header = table[0]
                columns = _unique_headers(header) if header else None
                frames.append(pd.DataFrame(table[1:], columns=columns))
    return frames


def _render_json(text, tables, images):
    """Serialise the extraction result as an indented JSON document."""
    json_data = {
        "text": text,
        "tables": [table.to_dict(orient='records') for table in tables],
        "images": images,
    }
    return json.dumps(json_data, indent=4)


def _render_markdown(text, tables, images):
    """Render the extraction result as Markdown."""
    parts = [f"# Extracted Text\n\n{text}\n\n# Tables\n"]
    for number, table in enumerate(tables, start=1):
        parts.append(f"## Table {number}\n")
        parts.append(table.to_markdown(index=False) + "\n\n")
    # Images are referenced by absolute path (resolved against the cwd).
    parts.append("\n\n# Images\n\n")
    for image in images:
        image_path = os.path.abspath(image["filename"])
        parts.append(f'![Image]({image_path})\n')
    return "".join(parts)


def _render_html(text, tables, images):
    """Render the extraction result as HTML, escaping PDF-derived strings."""
    parts = [f"<p>{html.escape(text)}</p>\n\n<h2>Tables</h2>\n"]
    for number, table in enumerate(tables, start=1):
        parts.append(f"<h2>Table {number}</h2>\n")
        parts.append(table.to_html() + "<br>")
    # Images are referenced by absolute path (resolved against the cwd).
    parts.append("\n\n<h2>Images</h2>\n\n")
    for image in images:
        image_path = html.escape(os.path.abspath(image["filename"]), quote=True)
        parts.append(f'<img src="{image_path}" alt="Image"><br>\n')
    return "".join(parts)


# Maps each supported output format to its renderer and download file suffix.
_RENDERERS = {
    "JSON": (_render_json, ".json"),
    "Markdown": (_render_markdown, ".md"),
    "HTML": (_render_html, ".html"),
}


def _write_download(payload, suffix):
    """Write *payload* to a new UTF-8 temporary file and return its path."""
    with tempfile.NamedTemporaryFile(
        "w", suffix=suffix, prefix="parsed_pdf_", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(payload)
        return handle.name


def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
    """
    Parses a PDF file, extracts text, tables, and images, and formats the output.

    Text and images come from pdfminer's page layout; tables come from
    pdfplumber. Extracted images are saved as ``extracted_image_<n>.png`` in
    the current working directory. The formatted result is written to a
    temporary file so that it can be handed to a ``gr.File`` output, which
    expects a file path rather than file contents.

    Args:
        pdf_file: Path to the uploaded PDF file, or an object whose ``name``
            attribute holds that path.
        output_format: Desired output format ("JSON", "Markdown", or "HTML").
        progress: Gradio Progress object for displaying progress. The default
            instance is deliberate: it is how the tracker is requested here.

    Returns:
        tuple: The extracted text and the path of a temporary file holding
               the output in the requested format.
               Returns an empty string and None if there is an error
               (including a missing file or an unsupported format).
    """
    if pdf_file is None:
        print("A main error occurred: no PDF file was provided")
        return "", None

    # Accept either a plain path or an upload wrapper exposing `.name`.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # Validate the format first so no extraction work is wasted on it.
        if output_format not in _RENDERERS:
            raise ValueError(f"Unsupported output format: {output_format!r}")
        render, suffix = _RENDERERS[output_format]

        text, images = _extract_text_and_images(pdf_path, progress)
        tables = _extract_tables(pdf_path)
        download_path = _write_download(render(text, tables, images), suffix)
    except Exception as main_e:
        # Top-level boundary for the UI callback: report and return the
        # documented error value instead of propagating.
        print(f"A main error occurred: {main_e}")
        return "", None # Return empty string and None in case of error

    return text, download_path

# Web UI: a PDF upload and a format selector go in; the extracted text and a
# downloadable file come out. The progress tracker is not listed as an input
# because parse_pdf requests it through its `progress` default argument.
iface = gr.Interface(
    fn=parse_pdf,
    inputs=[
        "file",
        # A default selection keeps parse_pdf from receiving None when the
        # user submits without touching the dropdown.
        gr.Dropdown(
            ["JSON", "Markdown", "HTML"],
            value="JSON",
            label="Output Format",
        ),
    ],
    outputs=[
        gr.Text(label="Output Text"),
        gr.File(label="Download Output"),
    ],
    title="PDF Parser",
    description="Parse a PDF and choose the output format.",
)

if __name__ == "__main__":
    # share=False keeps the app local (no public share link).
    iface.launch(share=False)