# Spaces: Running
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
def pdf_to_dataframe(pdf_path):
    """Extract the plain text of a PDF into a one-column DataFrame.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A DataFrame with a single ``Text`` column. The text of all pages is
        joined with newlines (in page order) and then split on ``"\\n"``,
        so each row holds one line of extracted text; blank lines are kept.
    """
    # Open the document in a ``with`` block so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # Join first, then split: this keeps the exact row layout of splitting
    # the whole document text on newline characters.
    lines = "\n".join(page_texts).split("\n")
    return pd.DataFrame(lines, columns=["Text"])
def dataframe_to_excel(df, excel_path):
    """Write *df* to an Excel workbook at *excel_path*.

    The DataFrame index is not written to the sheet.
    """
    df.to_excel(excel_writer=excel_path, index=False)
def main():
    """Build and launch the Gradio "PDF to Excel Converter" interface.

    The interface takes one uploaded PDF file and offers the converted
    Excel workbook as a downloadable file.
    """

    def pdf_to_excel_function(pdf_file):
        """Convert the uploaded PDF and return the path of the Excel file.

        Args:
            pdf_file: Value delivered by the ``gr.File`` input. Depending on
                the Gradio version this is a path string or a temp-file
                wrapper exposing the path as ``.name``.

        Returns:
            Path of the generated ``.xlsx`` file.

        Raises:
            gr.Error: If no file was uploaded.
        """
        if pdf_file is None:
            raise gr.Error("Please upload a PDF file.")

        # Read the upload in place instead of copying it to a fixed
        # "temp.pdf": a shared file name would be overwritten when two
        # requests run at the same time.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        df = pdf_to_dataframe(pdf_path)

        # Reserve a unique output path per request. The handle is closed
        # before writing so the Excel writer can reopen the file on every
        # platform; delete=False keeps it on disk for Gradio to serve.
        with tempfile.NamedTemporaryFile(
            prefix="output_", suffix=".xlsx", delete=False
        ) as tmp:
            excel_path = tmp.name

        dataframe_to_excel(df, excel_path)
        return excel_path

    # Create the Gradio interface.
    iface = gr.Interface(
        fn=pdf_to_excel_function,
        inputs=gr.File(label="Upload PDF File"),
        # Components are instantiated with a call, not a subscript.
        outputs=gr.File(label="Download Excel File"),
        title="PDF to Excel Converter",
        description="Convert a PDF file to an Excel file.",
    )

    # Launch the interface.
    iface.launch()
if __name__ == "__main__":
    # Start the web app only when executed as a script, not on import.
    main()