ADucatez committed on
Commit
71aaedb
·
1 Parent(s): 25e496d

Add of tabula functionality + exception management

Browse files
Files changed (2) hide show
  1. reduce_and_convert_PDF.py +93 -35
  2. requirements.txt +2 -0
reduce_and_convert_PDF.py CHANGED
@@ -8,6 +8,7 @@ import openpyxl
8
  from openpyxl.utils.dataframe import dataframe_to_rows
9
  from openpyxl.styles import numbers
10
  from openpyxl.worksheet.table import Table, TableStyleInfo
 
11
 
12
  def extract_pages(pdf_path, start_page, end_page, output_path):
13
  reader = PdfReader(pdf_path)
@@ -42,14 +43,23 @@ def reduce_pdf(pdf_folder,reduced_pdf_folder):
42
 
43
 
44
 
45
- def extract_tables_camelot(pdf_path):
46
- # Extract tables from the PDF file using Camelot
47
- tables = camelot.read_pdf(pdf_path, pages='all',flavor='stream')
48
- return tables
 
 
 
 
 
 
 
 
 
49
 
50
  def get_numeric_count(row):
51
  # Get the number of numerical values in a row
52
- return sum(1 for x in row if (pd.notna(pd.to_numeric(x.replace(",", "").strip('()'), errors='coerce')) or x in ['-','–']))
53
 
54
 
55
  def convert_to_numeric(value):
@@ -57,23 +67,32 @@ def convert_to_numeric(value):
57
  value = '-' + value[1:-1]
58
 
59
  if all(char.isdigit() or char in '-,.' for char in str(value)):
60
- cleaned_value = pd.to_numeric(value.replace(',', ''), errors='coerce')
61
  return cleaned_value
62
  return value
63
 
64
  def get_headers(dataframes):
65
  # Get the dataframe columns name
66
- if len(dataframes) >= 2:
67
- df_for_columns_names = dataframes[1]
68
- else:
69
- df_for_columns_names = dataframes[0]
70
- for i, row in df_for_columns_names.iterrows():
71
- numeric_count = get_numeric_count(row)
72
- if numeric_count >= 2:
73
- first_numeric_idx = i
74
- break
 
 
 
 
 
 
 
 
 
 
75
 
76
- df_for_columns_names = df_for_columns_names.astype(str).where(pd.notna(df_for_columns_names), "")
77
 
78
  new_header = [" ".join(filter(None, df_for_columns_names.iloc[:first_numeric_idx, col].values)) for col in range(df_for_columns_names.shape[1])]
79
 
@@ -81,7 +100,12 @@ def get_headers(dataframes):
81
 
82
  def clean_dataframe(df,header):
83
  # Rule : if a row is not numerical, merge it with the next numerical one
84
- df.columns = header
 
 
 
 
 
85
  first_numeric_idx = None
86
  for i, row in df.iterrows():
87
  numeric_count = get_numeric_count(row)
@@ -104,14 +128,14 @@ def clean_dataframe(df,header):
104
  buffer = list(df.iloc[i].copy())
105
  else:
106
  buffer = [
107
- " ".join(filter(lambda x: x not in [None, "None", ""], [buffer[j], df.iloc[i, j]]))
108
  for j in range(df.shape[1])
109
  ]
110
  merged_rows.append(i)
111
  else:
112
  if buffer is not None:
113
  df.iloc[i] = [
114
- " ".join(filter(lambda x: x not in [None, "None", ""], [buffer[j], df.iloc[i, j]]))
115
  for j in range(df.shape[1])
116
  ]
117
  buffer = None
@@ -120,9 +144,7 @@ def clean_dataframe(df,header):
120
  return clean_df
121
 
122
 
123
- def clean_and_concatenate_tables(tables):
124
- dataframes = [table.df for table in tables]
125
-
126
  for i in range(len(dataframes)):
127
  df = dataframes[i]
128
  row_counts = df.apply(lambda row: row.notna().sum() - (row.astype(str) == "").sum(), axis=1)
@@ -130,28 +152,57 @@ def clean_and_concatenate_tables(tables):
130
  dataframes[i] = df.loc[row_counts >= 1, col_counts >= 3].reset_index(drop = True)
131
 
132
  new_header = get_headers(dataframes)
133
-
134
  cleaned_dfs = []
135
 
136
  for df in dataframes:
137
- cleaned_dfs.append(clean_dataframe(df,new_header))
 
 
 
 
 
 
 
 
 
 
138
 
139
  concatenated_df = pd.concat(cleaned_dfs, ignore_index=True)
 
 
 
 
 
 
140
  return concatenated_df
141
 
142
 
 
143
  def convert_to_excel(reduced_pdf_folder, output_folder):
144
  if os.path.exists(output_folder):
145
  shutil.rmtree(output_folder)
146
  os.makedirs(output_folder)
147
 
 
 
 
 
 
 
 
 
 
 
148
  for filename in os.listdir(reduced_pdf_folder):
149
  if filename.endswith('.pdf'):
 
 
150
  pdf_path = os.path.join(reduced_pdf_folder, filename)
151
- tables = extract_tables_camelot(pdf_path)
152
- if tables:
153
- concatenated_df = clean_and_concatenate_tables(tables)
154
-
 
155
  excel_path = os.path.join(output_folder, filename.replace('.pdf', '.xlsx'))
156
 
157
  for col in concatenated_df.columns:
@@ -172,7 +223,7 @@ def convert_to_excel(reduced_pdf_folder, output_folder):
172
  ws.cell(row=r_idx + 1, column=c_idx + 1, value=numeric_value)
173
  percentage_cells.append((r_idx + 1, c_idx + 1))
174
 
175
- tab = Table(displayName = "Table1",ref=ws.dimensions)
176
  style = TableStyleInfo(
177
  name="TableStyleMedium9",
178
  showFirstColumn=False,
@@ -186,22 +237,29 @@ def convert_to_excel(reduced_pdf_folder, output_folder):
186
 
187
  # Ajuster la largeur des colonnes
188
  for column_cells in ws.columns:
189
- length = min(max(len(str(cell.value)) for cell in column_cells),30)
190
  ws.column_dimensions[column_cells[0].column_letter].width = length + 2
191
 
192
  for row, col in percentage_cells:
193
  cell = ws.cell(row=row, column=col)
194
  cell.number_format = numbers.BUILTIN_FORMATS[10]
195
  wb.save(excel_path)
196
- print(f'Saved {filename} as Excel file')
197
- else:
198
- print(f'No tables found in {filename}')
199
- shutil.make_archive(base_name="./output", format='zip', root_dir="./outputs")
 
 
 
 
 
 
 
200
 
201
 
202
  def reduce_and_convert(input_folder):
203
  reduced_pdf_folder = "./reduced_pdf"
204
- output_folder = './outputs'
205
  reduce_pdf(input_folder,reduced_pdf_folder)
206
  convert_to_excel(reduced_pdf_folder, output_folder)
207
 
 
8
  from openpyxl.utils.dataframe import dataframe_to_rows
9
  from openpyxl.styles import numbers
10
  from openpyxl.worksheet.table import Table, TableStyleInfo
11
+ import tabula
12
 
13
  def extract_pages(pdf_path, start_page, end_page, output_path):
14
  reader = PdfReader(pdf_path)
 
43
 
44
 
45
 
46
+
47
+
48
+
49
def extract_tables_camelot_or_tabula(pdf_path):
    """Extract the tables of a PDF as a list of pandas DataFrames.

    Camelot (``stream`` flavor, all pages) is tried first. Tabula is used as
    a fallback when Camelot raises, and also when Camelot completes but
    returns no table at all, so an empty Camelot result no longer
    short-circuits the fallback.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one DataFrame per detected table: the ``.df`` of each
        Camelot table, or the list returned by ``tabula.read_pdf``.

    Raises:
        Exception: Any error raised by Tabula propagates to the caller.
    """
    try:
        tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')
        dataframes = [table.df for table in tables]
        if dataframes:
            return dataframes
        print("Camelot found no table.")
    except Exception as e:
        # Deliberately broad: any Camelot failure must trigger the fallback.
        print(f"Camelot failed with error: {e}")
    print("Trying with Tabula...")
    return tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
58
+
59
 
60
  def get_numeric_count(row):
61
  # Get the number of numerical values in a row
62
+ return sum(1 for x in row if (pd.notna(pd.to_numeric(str(x).replace(",", "").strip('()'), errors='coerce')) or x in ['-','–']))
63
 
64
 
65
  def convert_to_numeric(value):
 
67
  value = '-' + value[1:-1]
68
 
69
  if all(char.isdigit() or char in '-,.' for char in str(value)):
70
+ cleaned_value = pd.to_numeric(str(value).replace(',', ''), errors='coerce')
71
  return cleaned_value
72
  return value
73
 
74
  def get_headers(dataframes):
75
  # Get the dataframe columns name
76
+ # if len(dataframes) >= 2:
77
+ # df_for_columns_names = dataframes[1]
78
+ # else:
79
+ # df_for_columns_names = dataframes[0]
80
+ first_numeric_idx = None
81
+
82
+ order = list(range(1, len(dataframes))) + [0]
83
+
84
+ for k in order:
85
+ if first_numeric_idx is None:
86
+ df_for_columns_names = dataframes[k]
87
+ df_for_columns_names = df_for_columns_names.astype(str).where(pd.notna(df_for_columns_names), "")
88
+ for i, row in df_for_columns_names.iterrows():
89
+ numeric_count = get_numeric_count(row)
90
+ if numeric_count >= 2:
91
+ first_numeric_idx = i
92
+ break
93
+ if first_numeric_idx is not None:
94
+ break
95
 
 
96
 
97
  new_header = [" ".join(filter(None, df_for_columns_names.iloc[:first_numeric_idx, col].values)) for col in range(df_for_columns_names.shape[1])]
98
 
 
100
 
101
  def clean_dataframe(df,header):
102
  # Rule : if a row is not numerical, merge it with the next numerical one
103
+ if len(header) < len(df.columns):
104
+ df.columns = header + [f"Unnamed_{i}" for i in range(len(header), len(df.columns))]
105
+ elif len(header) > len(df.columns):
106
+ df.columns = header[:len(df.columns)]
107
+ else:
108
+ df.columns = header
109
  first_numeric_idx = None
110
  for i, row in df.iterrows():
111
  numeric_count = get_numeric_count(row)
 
128
  buffer = list(df.iloc[i].copy())
129
  else:
130
  buffer = [
131
+ " ".join(filter(lambda x: x not in [None, "None", ""], [str(buffer[j]), str(df.iloc[i, j])]))
132
  for j in range(df.shape[1])
133
  ]
134
  merged_rows.append(i)
135
  else:
136
  if buffer is not None:
137
  df.iloc[i] = [
138
+ " ".join(filter(lambda x: x not in [None, "None", ""], [str(buffer[j]), str(df.iloc[i, j])]))
139
  for j in range(df.shape[1])
140
  ]
141
  buffer = None
 
144
  return clean_df
145
 
146
 
147
+ def clean_and_concatenate_tables(dataframes):
 
 
148
  for i in range(len(dataframes)):
149
  df = dataframes[i]
150
  row_counts = df.apply(lambda row: row.notna().sum() - (row.astype(str) == "").sum(), axis=1)
 
152
  dataframes[i] = df.loc[row_counts >= 1, col_counts >= 3].reset_index(drop = True)
153
 
154
  new_header = get_headers(dataframes)
 
155
  cleaned_dfs = []
156
 
157
  for df in dataframes:
158
+ if len(df.columns) >= 3 :
159
+ cleaned_dfs.append(clean_dataframe(df,new_header))
160
+
161
+ cleaned_dfs = [df.reset_index(drop=True) for df in cleaned_dfs if isinstance(df, pd.DataFrame) and not df.empty]
162
+
163
+ if not cleaned_dfs:
164
+ raise ValueError("After cleaning, no valid dataframe left.")
165
+
166
+ for _, df in enumerate(cleaned_dfs):
167
+ if any(col == '' for col in df.columns): # Check if there are empty column names
168
+ df.columns = [f"col_{j}" if col == '' else col for j, col in enumerate(df.columns)]
169
 
170
  concatenated_df = pd.concat(cleaned_dfs, ignore_index=True)
171
+
172
+ if concatenated_df.shape[0] <= 4 :
173
+ raise ValueError("Dataframe too small, probable mistake")
174
+ if concatenated_df.shape[1] <= 2 :
175
+ raise ValueError("Less than 3 columns, probable mistake")
176
+ print("Success of conversion : dataframe of shape ",concatenated_df.shape)
177
  return concatenated_df
178
 
179
 
180
+
181
  def convert_to_excel(reduced_pdf_folder, output_folder):
182
  if os.path.exists(output_folder):
183
  shutil.rmtree(output_folder)
184
  os.makedirs(output_folder)
185
 
186
+ failed_folder = os.path.join(output_folder, "failed_to_convert")
187
+ if os.path.exists(failed_folder):
188
+ shutil.rmtree(failed_folder)
189
+ os.makedirs(failed_folder)
190
+
191
+ if os.path.exists("./log_errors.txt"):
192
+ os.remove("./log_errors.txt")
193
+
194
+ number_of_files = 0
195
+ number_of_fails = 0
196
  for filename in os.listdir(reduced_pdf_folder):
197
  if filename.endswith('.pdf'):
198
+ number_of_files += 1
199
+ print("Trying to convert :", filename, "to excel")
200
  pdf_path = os.path.join(reduced_pdf_folder, filename)
201
+ try:
202
+ dataframes = extract_tables_camelot_or_tabula(pdf_path)
203
+ if not dataframes:
204
+ raise ValueError(f'No tables found in {filename}')
205
+ concatenated_df = clean_and_concatenate_tables(dataframes)
206
  excel_path = os.path.join(output_folder, filename.replace('.pdf', '.xlsx'))
207
 
208
  for col in concatenated_df.columns:
 
223
  ws.cell(row=r_idx + 1, column=c_idx + 1, value=numeric_value)
224
  percentage_cells.append((r_idx + 1, c_idx + 1))
225
 
226
+ tab = Table(displayName="Table1", ref=ws.dimensions)
227
  style = TableStyleInfo(
228
  name="TableStyleMedium9",
229
  showFirstColumn=False,
 
237
 
238
  # Ajuster la largeur des colonnes
239
  for column_cells in ws.columns:
240
+ length = min(max(len(str(cell.value)) for cell in column_cells), 30)
241
  ws.column_dimensions[column_cells[0].column_letter].width = length + 2
242
 
243
  for row, col in percentage_cells:
244
  cell = ws.cell(row=row, column=col)
245
  cell.number_format = numbers.BUILTIN_FORMATS[10]
246
  wb.save(excel_path)
247
+ except Exception as e:
248
+ error_message = f"Error converting {filename}: {e}"
249
+ print(error_message)
250
+ number_of_fails += 1
251
+ shutil.copy(pdf_path, os.path.join(failed_folder, filename))
252
+ with open("./log_errors.txt", "a") as log_file:
253
+ log_file.write(error_message + "\n")
254
+ print("Number of files considered : ",number_of_files)
255
+ print("Number of success : ", number_of_files - number_of_fails)
256
+ print("Number of fails : ", number_of_fails)
257
+ shutil.make_archive(base_name="./output", format='zip', root_dir=output_folder)
258
 
259
 
260
  def reduce_and_convert(input_folder):
261
  reduced_pdf_folder = "./reduced_pdf"
262
+ output_folder = "./outputs"
263
  reduce_pdf(input_folder,reduced_pdf_folder)
264
  convert_to_excel(reduced_pdf_folder, output_folder)
265
 
requirements.txt CHANGED
@@ -2,3 +2,5 @@ PyPDF2
2
  pandas
3
  camelot-py
4
  openpyxl
 
 
 
2
  pandas
3
  camelot-py
4
  openpyxl
5
+ PyCryptodome
6
+ tabula-py