Spaces:
Runtime error
Runtime error
| import fitz # PyMuPDF | |
| import pandas as pd | |
| # Function to convert PDF to DataFrame | |
def pdf_to_dataframe(pdf_path):
    """Extract the plain text of a PDF into a one-column DataFrame.

    Each page is read in document order with ``page.get_text("text")``.
    The page texts are joined with newlines and then split on newlines,
    so every text line becomes one row.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A ``pandas.DataFrame`` with a single ``'Text'`` column holding one
        line of extracted text per row. Empty lines are kept as empty
        strings.
    """
    # Open the document in a context manager so its file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    lines = "\n".join(page_texts).split("\n")
    return pd.DataFrame(lines, columns=['Text'])
| # Function to save DataFrame to Excel | |
def dataframe_to_excel(df, excel_path):
    """Write *df* to an Excel file at *excel_path*, omitting the index.

    Args:
        df: Object providing ``to_excel``; ``main`` passes the DataFrame
            returned by ``pdf_to_dataframe``.
        excel_path: Destination handed straight to ``df.to_excel``.
    """
    # Keep the row index out of the sheet.
    export_options = {"index": False}
    df.to_excel(excel_path, **export_options)
| # Main function | |
def main():
    """Convert ``input.pdf`` to ``output.xlsx`` and print the output path."""
    excel_path = 'output.xlsx'  # Where the Excel file is written

    # Extract the text of the input PDF, then write it to the workbook.
    frame = pdf_to_dataframe('input.pdf')
    dataframe_to_excel(frame, excel_path)

    print(f"Excel saved to {excel_path}")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()