Spaces:

lodhrangpt
/

pdf_to_excel

Runtime error

lodhrangpt committed on Nov 19, 2024

Commit

5ad10db

verified ·

1 Parent(s): 4f34dce

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+import fitz  # PyMuPDF
+import pandas as pd
+# Function to convert PDF to DataFrame
+def pdf_to_dataframe(pdf_path):
+    # Open the PDF document
+    doc = fitz.open(pdf_path)
+    # Initialize an empty list to store text blocks
+    text_blocks = []
+    # Iterate through each page in the PDF
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        text = page.get_text("text")
+        text_blocks.append(text)
+    # Join all text blocks into a single string
+    full_text = "\n".join(text_blocks)
+    # Split the text into lines
+    lines = full_text.split('\n')
+    # Create a DataFrame from the lines
+    df = pd.DataFrame(lines, columns=['Text'])
+    return df
+# Function to save DataFrame to Excel
+def dataframe_to_excel(df, excel_path):
+    # Save the DataFrame to an Excel file
+    df.to_excel(excel_path, index=False)
+# Main function
+def main():
+    pdf_path = 'input.pdf'  # Path to your input PDF file
+    excel_path = 'output.xlsx'  # Path where the Excel file will be saved
+    # Convert PDF to DataFrame
+    df = pdf_to_dataframe(pdf_path)
+    # Save DataFrame to Excel
+    dataframe_to_excel(df, excel_path)
+    print(f"Excel saved to {excel_path}")
+# Run the conversion only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == "__main__":
+    main()