import fitz  # PyMuPDF
import pandas as pd


def pdf_to_dataframe(pdf_path) -> pd.DataFrame:
    """Extract the plain text of a PDF into a one-column DataFrame.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A DataFrame with a single ``'Text'`` column holding one row per
        line of extracted text, in page order.
    """
    # The context manager closes the document even if extraction fails,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # Join the pages first and split afterwards so the row layout stays the
    # same as a plain newline split of the whole document text.
    full_text = "\n".join(page_texts)
    lines = full_text.split('\n')
    return pd.DataFrame(lines, columns=['Text'])


def dataframe_to_excel(df: pd.DataFrame, excel_path) -> None:
    """Write *df* to an Excel file at *excel_path* without the index column."""
    df.to_excel(excel_path, index=False)


def main(pdf_path='input.pdf', excel_path='output.xlsx') -> None:
    """Convert a PDF to an Excel sheet of its text lines.

    Args:
        pdf_path: Path to the input PDF file.
        excel_path: Path where the Excel file will be saved.
    """
    df = pdf_to_dataframe(pdf_path)
    dataframe_to_excel(df, excel_path)
    print(f"Excel saved to {excel_path}")


if __name__ == "__main__":
    main()