Aymeric Ducatez committed on
Commit
4f68924
·
1 Parent(s): 02c1e42

Commit initial

Browse files
Files changed (2) hide show
  1. app.py +218 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pathlib import Path
3
+ import os
4
+ import shutil
5
+ import re
6
+ from PyPDF2 import PdfReader, PdfWriter
7
+ import pandas as pd
8
+ import camelot
9
+ import openpyxl
10
+ from openpyxl.utils.dataframe import dataframe_to_rows
11
+ from openpyxl.styles import numbers
12
+ from openpyxl.worksheet.table import Table, TableStyleInfo
13
+
14
def extract_pages(pdf_path, start_page, end_page, output_path):
    """Copy an inclusive, 1-based page range of a PDF into a new PDF file.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page to keep (1-based).
        end_page: Last page to keep (1-based, inclusive).
        output_path: Path of the PDF to write.

    Page numbers outside the document are ignored. If nothing in the range
    exists, the output file is still written but contains no pages.
    """
    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    total_pages = len(reader.pages)

    # Clamp to the pages that actually exist. Without the lower bound a
    # start page of 0 (or below) would index reader.pages[-1] and silently
    # pull pages from the END of the document.
    first_page = max(start_page, 1)
    last_page = min(end_page, total_pages)

    for page_num in range(first_page, last_page + 1):
        writer.add_page(reader.pages[page_num - 1])

    with open(output_path, 'wb') as output_pdf_file:
        writer.write(output_pdf_file)
24
+
25
+
26
def reduce_pdf(pdf_folder, reduced_pdf_folder):
    """Write the page-range excerpt of every range-tagged PDF in a folder.

    A file is processed when its name ends in ``.pdf`` and contains a
    ``<start>-<end>`` page range. The range is taken from a trailing
    ``_<start>-<end>.pdf`` suffix when there is one; otherwise the first
    ``<digits>-<digits>`` pair in the name is used. The excerpt is written
    to ``reduced_pdf_folder`` under the same name with the trailing
    ``_<start>-<end>`` suffix removed.

    Args:
        pdf_folder: Folder containing the source PDFs.
        reduced_pdf_folder: Folder receiving the excerpts (created if missing).
    """
    os.makedirs(reduced_pdf_folder, exist_ok=True)

    suffix_range = re.compile(r'_(\d+)-(\d+)\.pdf$')
    any_range = re.compile(r'(\d+)-(\d+)')

    for filename in os.listdir(pdf_folder):
        if not filename.endswith('.pdf'):
            continue

        # Prefer the suffix that is also stripped from the output name, so
        # that e.g. "report-2020-21_3-7.pdf" yields pages 3-7 rather than
        # the unrelated "2020-21" earlier in the name.
        match = suffix_range.search(filename) or any_range.search(filename)
        if not match:
            continue

        start_page = int(match.group(1))
        end_page = int(match.group(2))
        base_name = re.sub(r'_\d+-\d+\.pdf$', '.pdf', filename)

        pdf_path = os.path.join(pdf_folder, filename)
        output_path = os.path.join(reduced_pdf_folder, base_name)

        extract_pages(pdf_path, start_page, end_page, output_path)
        print(f'Processed {filename} -> {base_name}')
43
+
44
def extract_tables_camelot(pdf_path):
    """Read the tables of every page of a PDF with Camelot.

    Args:
        pdf_path: Path of the PDF to parse.

    Returns:
        Whatever ``camelot.read_pdf`` returns for ``pages='all'`` and
        ``flavor='stream'``; callers in this file iterate over it and read
        each item's ``.df`` attribute.
    """
    return camelot.read_pdf(pdf_path, pages='all', flavor='stream')
48
+
49
def get_numeric_count(row):
    """Count the cells of a row that look like numeric values.

    A cell counts when, after removing thousands separators (``,``) and
    surrounding parentheses, ``pandas.to_numeric`` can parse it, or when it
    is one of the dash placeholders ``-`` / ``–``.

    Missing cells (``None`` / NaN) never count. Other non-string cells are
    judged on their string form, so this no longer raises
    ``AttributeError`` for rows that contain non-string values.

    Args:
        row: Iterable of cell values (e.g. a ``pandas.Series``).

    Returns:
        Number of numeric-looking cells in ``row``.
    """
    dash_placeholders = ('-', '–')
    count = 0
    for cell in row:
        if not isinstance(cell, str):
            if pd.isna(cell):
                continue
            cell = str(cell)
        if cell in dash_placeholders:
            count += 1
            continue
        parsed = pd.to_numeric(cell.replace(",", "").strip('()'), errors='coerce')
        if pd.notna(parsed):
            count += 1
    return count
52
+
53
+
54
def convert_to_numeric(value):
    """Convert a numeric-looking string to a number, leaving anything else alone.

    Handles thousands separators (``1,234``) and accounting-style negatives
    (``(56)`` becomes ``-56``).

    The original value is returned unchanged when it is not a string, when
    it contains characters other than digits and ``-``, ``,``, ``.``, or
    when the candidate cannot actually be parsed. This means labels such as
    ``(restated)`` or ``2020-21``, dash placeholders and empty cells keep
    their text instead of being mangled or replaced by NaN.

    Args:
        value: A single cell value.

    Returns:
        The parsed number, or ``value`` itself when no conversion applies.
    """
    if not isinstance(value, str):
        return value

    # Work on a copy so a failed conversion can return the untouched input.
    candidate = value
    if candidate.startswith('(') and candidate.endswith(')'):
        candidate = '-' + candidate[1:-1]

    if not all(char.isdigit() or char in '-,.' for char in candidate):
        return value

    parsed = pd.to_numeric(candidate.replace(',', ''), errors='coerce')
    if pd.isna(parsed):
        return value
    return parsed
62
+
63
def get_headers(dataframes):
    """Build column names from the text rows that precede the first numeric row.

    The second dataframe is used when there are at least two, otherwise the
    first one.
    NOTE(review): presumably the first table of a document is less
    representative than the second - confirm the reason for this choice.

    Every row above the first row holding at least two numeric-looking
    cells is treated as a header line; for each column the non-empty cells
    of those lines are joined with a space.

    Args:
        dataframes: List of ``pandas.DataFrame`` objects.

    Returns:
        One header string per column of the chosen dataframe. An empty list
        when ``dataframes`` is empty. When no row has two numeric cells,
        all rows are treated as header lines (this used to raise
        ``UnboundLocalError``).
    """
    if not dataframes:
        return []

    source_df = dataframes[1] if len(dataframes) >= 2 else dataframes[0]

    # Track the POSITION of the first numeric row (not its index label),
    # because it is used with .iloc below.
    header_row_count = len(source_df)
    for position, (_, row) in enumerate(source_df.iterrows()):
        if get_numeric_count(row) >= 2:
            header_row_count = position
            break

    # Stringify everything, turning missing cells into "" so they are
    # dropped by filter(None, ...) instead of showing up as "nan"/"None".
    text_df = source_df.astype(str).where(pd.notna(source_df), "")

    new_header = [
        " ".join(filter(None, text_df.iloc[:header_row_count, col].values))
        for col in range(text_df.shape[1])
    ]

    return new_header
80
+
81
def clean_dataframe(df, header):
    """Apply ``header`` to a table and fold text-only rows into the next numeric row.

    Steps:
      1. Rename the columns to ``header`` (on a copy; the caller's
         dataframe is left untouched).
      2. Drop every row above the first row with at least two
         numeric-looking cells. If there is no such row, nothing is dropped.
      3. Rule: if a row is not numerical, merge it with the next numerical
         one. Consecutive non-numeric rows are accumulated cell by cell
         (joined with a space) and prepended to the next numeric row; the
         accumulated rows themselves are removed.

    Non-numeric rows that are not followed by a numeric row are removed
    without being merged anywhere.

    Args:
        df: Table to clean.
        header: Column names; must have one entry per column of ``df``
            (pandas raises ``ValueError`` otherwise).

    Returns:
        A new dataframe with a fresh 0..n-1 index.
    """
    def _merge_cells(prefix, cell):
        # Join the two cell texts, skipping empty / missing placeholders.
        return " ".join(filter(lambda x: x not in [None, "None", ""], [prefix, cell]))

    # Copy first so renaming the columns does not mutate the caller's object.
    df = df.copy()
    df.columns = header

    # Position (not index label) of the first numeric row, for use with .iloc.
    first_numeric_pos = 0
    for position, (_, row) in enumerate(df.iterrows()):
        if get_numeric_count(row) >= 2:
            first_numeric_pos = position
            break

    df = df.iloc[first_numeric_pos:].reset_index(drop=True)

    merged_rows = []
    buffer = None
    column_count = df.shape[1]

    for i in range(len(df)):
        if get_numeric_count(df.iloc[i]) < 2:
            # Text-only row: start or extend the pending label buffer.
            if buffer is None:
                buffer = list(df.iloc[i].copy())
            else:
                buffer = [
                    _merge_cells(buffer[j], df.iloc[i, j])
                    for j in range(column_count)
                ]
            merged_rows.append(i)
        elif buffer is not None:
            # Numeric row: prepend the pending text to it, then reset.
            df.iloc[i] = [
                _merge_cells(buffer[j], df.iloc[i, j])
                for j in range(column_count)
            ]
            buffer = None

    clean_df = df.drop(merged_rows).reset_index(drop=True)
    return clean_df
120
+
121
+
122
def clean_and_concatenate_tables(tables):
    """Clean every extracted table and stack them into a single dataframe.

    For each table, rows with no filled cell and columns with fewer than
    three filled cells are discarded (a cell is "filled" when it is neither
    missing nor the empty string). A common header is then derived with
    ``get_headers`` and each table is cleaned with ``clean_dataframe``
    before concatenation.

    Args:
        tables: Iterable of objects exposing a ``.df`` dataframe attribute.

    Returns:
        The concatenation of all cleaned tables, re-indexed from 0.

    NOTE(review): the same header is applied to every table, so a table
    whose remaining column count differs from the header length makes
    ``clean_dataframe`` raise ``ValueError``.
    """
    dataframes = []
    for table in tables:
        df = table.df
        # One vectorised mask instead of per-row / per-column lambdas.
        filled = df.notna() & (df.astype(str) != "")
        keep_rows = filled.sum(axis=1) >= 1
        keep_cols = filled.sum(axis=0) >= 3
        dataframes.append(df.loc[keep_rows, keep_cols].reset_index(drop=True))

    new_header = get_headers(dataframes)

    cleaned_dfs = [clean_dataframe(df, new_header) for df in dataframes]

    concatenated_df = pd.concat(cleaned_dfs, ignore_index=True)
    return concatenated_df
140
+
141
+
142
def _write_excel_table(df, excel_path):
    """Save ``df`` to ``excel_path`` as a one-sheet workbook formatted as an Excel table."""
    wb = openpyxl.Workbook()
    ws = wb.active

    percentage_cells = []

    # Add the DataFrame data to the worksheet
    for r_idx, r in enumerate(dataframe_to_rows(df, index=False, header=True)):
        ws.append(r)
        if r_idx == 0:
            # First emitted row is the header (header=True); keep it as
            # text, since table column headings must be strings.
            continue
        for c_idx, value in enumerate(r):
            if not (isinstance(value, str) and value.endswith('%')):
                continue
            numeric_value = pd.to_numeric(value.strip('%'), errors='coerce')
            if pd.isna(numeric_value):
                # Not actually a number (e.g. "n/a%"): keep the text cell
                # rather than overwriting it with NaN.
                continue
            ws.cell(row=r_idx + 1, column=c_idx + 1, value=numeric_value / 100)
            percentage_cells.append((r_idx + 1, c_idx + 1))

    tab = Table(displayName="Table1", ref=ws.dimensions)
    tab.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9",
        showFirstColumn=False,
        showLastColumn=False,
        showRowStripes=True,
        showColumnStripes=True
    )
    ws.add_table(tab)

    # Adjust the column widths (longest cell text, capped at 30, plus padding)
    for column_cells in ws.columns:
        length = min(max(len(str(cell.value)) for cell in column_cells), 30)
        ws.column_dimensions[column_cells[0].column_letter].width = length + 2

    # Built-in number format 10 is the two-decimal percentage format.
    for row, col in percentage_cells:
        ws.cell(row=row, column=col).number_format = numbers.BUILTIN_FORMATS[10]

    wb.save(excel_path)


def convert_to_excel(reduced_pdf_folder, output_folder):
    """Convert the tables of every PDF in a folder to Excel and zip the results.

    For each ``.pdf`` file in ``reduced_pdf_folder`` the tables are
    extracted, cleaned and concatenated, numeric-looking cells are converted
    to numbers, and the result is saved as ``<name>.xlsx`` in
    ``output_folder``. Finally the whole ``output_folder`` is archived to
    ``./output.zip``.

    Args:
        reduced_pdf_folder: Folder containing the PDFs to convert.
        output_folder: Folder receiving the workbooks (created if missing).

    Returns:
        The archive path reported by ``shutil.make_archive``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(reduced_pdf_folder):
        if not filename.endswith('.pdf'):
            continue

        pdf_path = os.path.join(reduced_pdf_folder, filename)
        tables = extract_tables_camelot(pdf_path)
        if not tables:
            print(f'No tables found in {filename}')
            continue

        concatenated_df = clean_and_concatenate_tables(tables)

        # Swap only the extension; str.replace would also hit ".pdf"
        # occurring earlier in the name.
        excel_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.xlsx')

        # Convert column by column by POSITION: label-based access returns a
        # DataFrame (not a Series) when header names are duplicated.
        concatenated_df = concatenated_df.apply(
            lambda column: column.map(convert_to_numeric)
        )

        _write_excel_table(concatenated_df, excel_path)
        print(f'Saved {filename} as Excel file')

    # Archive the folder that was actually written to, not a hard-coded path.
    return shutil.make_archive(base_name="./output", format='zip', root_dir=output_folder)
197
+
198
def reduce_and_convert(input_folder):
    """Run the full pipeline: trim the PDFs, convert them to Excel, zip the result.

    Intermediate PDFs go to ``./reduced_pdf`` and workbooks to
    ``./outputs``.

    NOTE(review): neither working folder is emptied between runs, so files
    from earlier runs are converted / archived again - confirm whether that
    is intended.

    Args:
        input_folder: Folder containing the range-tagged source PDFs.

    Returns:
        Path of the zip archive produced by ``convert_to_excel``
        (``./output.zip``). Previously nothing was returned although the
        caller assigns the result.
    """
    reduced_pdf_folder = "./reduced_pdf"
    output_folder = './outputs'
    reduce_pdf(input_folder, reduced_pdf_folder)
    convert_to_excel(reduced_pdf_folder, output_folder)
    # convert_to_excel archives to base name "./output" in zip format.
    return "./output.zip"
203
+
204
+
205
def ui(input_folder):
    """Gradio click handler: process a folder and return the archive to download.

    Args:
        input_folder: Folder path typed by the user.

    Returns:
        Path of the zip archive to expose in the download component.

    Raises:
        gr.Error: When the given path is not an existing directory, so the
            user sees an explicit message instead of a raw traceback.
    """
    folder = (input_folder or "").strip()
    if not os.path.isdir(folder):
        raise gr.Error(f"Input folder not found: {folder!r}")

    # Fall back to the known archive location if the pipeline returns nothing.
    zip_path = reduce_and_convert(folder) or "./output.zip"
    return zip_path
209
+
210
# Build the web interface: a folder-path textbox, a button that triggers the
# pipeline, and a file component that offers the resulting archive.
with gr.Blocks() as appli:
    gr.Markdown("## PDF Reduction & Conversion Tool")
    input_folder = gr.Textbox(label="Enter Input Folder Path")
    process_button = gr.Button("Process Files")
    download_link = gr.File(label="Download Processed Zip")

    # Clicking the button calls ui(<textbox value>) and shows its return
    # value in the download component.
    process_button.click(ui, input_folder, download_link)

# Start the app as soon as this module is executed.
appli.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ PyPDF2
3
+ pandas
4
+ camelot-py
5
+ openpyxl
6
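+ # shutil -- standard-library module; does not need to be installed with pip
7
+ # os -- standard-library module; does not need to be installed with pip
8
+ # pathlib -- standard-library module (Python 3.4+); does not need to be installed with pip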