lodhrangpt committed on
Commit
5ad10db
·
verified ·
1 Parent(s): 4f34dce

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import pandas as pd
3
+
4
+ # Function to convert PDF to DataFrame
5
def pdf_to_dataframe(pdf_path):
    """Extract the plain text of a PDF into a single-column DataFrame.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A DataFrame with one column, ``Text``, holding one row per line of
        extracted text in page order. Empty lines are kept as empty strings.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # Pages are joined with a newline before splitting so that the last line
    # of one page never merges with the first line of the next.
    lines = "\n".join(page_texts).split("\n")

    return pd.DataFrame(lines, columns=['Text'])
28
+
29
+ # Function to save DataFrame to Excel
30
def dataframe_to_excel(df, excel_path):
    """Write *df* to an Excel file at *excel_path*, omitting the index column."""
    df.to_excel(excel_writer=excel_path, index=False)
33
+
34
+ # Main function
35
def main(pdf_path='input.pdf', excel_path='output.xlsx'):
    """Convert a PDF to an Excel workbook and print where it was saved.

    Args:
        pdf_path: Path of the PDF to read. Defaults to ``input.pdf``.
        excel_path: Path of the Excel file to write. Defaults to
            ``output.xlsx``.
    """
    # Convert PDF to DataFrame
    df = pdf_to_dataframe(pdf_path)

    # Save DataFrame to Excel
    dataframe_to_excel(df, excel_path)

    print(f"Excel saved to {excel_path}")
46
+
47
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()