ADucatez commited on
Commit
25e496d
·
1 Parent(s): 716c402

Reorganisation of files

Browse files
Files changed (3) hide show
  1. README.md +4 -0
  2. app.py +1 -207
  3. reduce_and_convert_PDF.py +211 -0
README.md CHANGED
@@ -12,3 +12,7 @@ short_description: Convert CbCR to Excel using camelot and manual treatement
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
16
+
17
+ This program accepts a folder of PDF files as input and returns a .zip file containing the corresponding Excel files.
18
+ Each file name must contain "_CbCR_" followed by the number of the page where the CbCR table is located. If the table is split across several pages, give the first and last page separated by a dash (for example, Acciona_2023_CbCR_176.pdf or Shell_2023_CbCR_55-57.pdf).
app.py CHANGED
@@ -1,214 +1,8 @@
1
  import gradio as gr
2
  import os
3
  import shutil
4
- import re
5
- from PyPDF2 import PdfReader, PdfWriter
6
- import pandas as pd
7
- import camelot
8
- import openpyxl
9
- from openpyxl.utils.dataframe import dataframe_to_rows
10
- from openpyxl.styles import numbers
11
- from openpyxl.worksheet.table import Table, TableStyleInfo
12
  import tempfile
13
-
14
- def extract_pages(pdf_path, start_page, end_page, output_path):
15
- reader = PdfReader(pdf_path)
16
- writer = PdfWriter()
17
- for page_num in range(start_page, end_page + 1):
18
- if page_num <= len(reader.pages):
19
- writer.add_page(reader.pages[page_num - 1])
20
-
21
- with open(output_path, 'wb') as output_pdf_file:
22
- writer.write(output_pdf_file)
23
-
24
-
25
- def reduce_pdf(pdf_folder,reduced_pdf_folder):
26
- if os.path.exists(reduced_pdf_folder):
27
- shutil.rmtree(reduced_pdf_folder)
28
- os.makedirs(reduced_pdf_folder)
29
-
30
- for filename in os.listdir(pdf_folder):
31
- if filename.endswith('.pdf'):
32
- match = re.search(r'_CbCR_(\d+)(?:-(\d+))?', filename)
33
- if match:
34
- start_page = int(match.group(1))
35
- end_page = int(match.group(2)) if match.group(2) else start_page
36
- base_name = re.sub(r'_CbCR_\d+(?:-\d+)?\.pdf$', '_CbCR.pdf', filename)
37
- pdf_path = os.path.join(pdf_folder, filename)
38
- output_path = os.path.join(reduced_pdf_folder, base_name)
39
-
40
- extract_pages(pdf_path, start_page, end_page, output_path)
41
- print(f'Processed {filename} -> {base_name}')
42
-
43
-
44
-
45
-
46
-
47
- def extract_tables_camelot(pdf_path):
48
- # Extract tables from the PDF file using Camelot
49
- tables = camelot.read_pdf(pdf_path, pages='all',flavor='stream')
50
- return tables
51
-
52
- def get_numeric_count(row):
53
- # Get the number of numerical values in a row
54
- return sum(1 for x in row if (pd.notna(pd.to_numeric(x.replace(",", "").strip('()'), errors='coerce')) or x in ['-','–']))
55
-
56
-
57
- def convert_to_numeric(value):
58
- if isinstance(value, str) and value.startswith('(') and value.endswith(')'):
59
- value = '-' + value[1:-1]
60
-
61
- if all(char.isdigit() or char in '-,.' for char in str(value)):
62
- cleaned_value = pd.to_numeric(value.replace(',', ''), errors='coerce')
63
- return cleaned_value
64
- return value
65
-
66
- def get_headers(dataframes):
67
- # Get the dataframe columns name
68
- if len(dataframes) >= 2:
69
- df_for_columns_names = dataframes[1]
70
- else:
71
- df_for_columns_names = dataframes[0]
72
- for i, row in df_for_columns_names.iterrows():
73
- numeric_count = get_numeric_count(row)
74
- if numeric_count >= 2:
75
- first_numeric_idx = i
76
- break
77
-
78
- df_for_columns_names = df_for_columns_names.astype(str).where(pd.notna(df_for_columns_names), "")
79
-
80
- new_header = [" ".join(filter(None, df_for_columns_names.iloc[:first_numeric_idx, col].values)) for col in range(df_for_columns_names.shape[1])]
81
-
82
- return new_header
83
-
84
- def clean_dataframe(df,header):
85
- # Rule : if a row is not numerical, merge it with the next numerical one
86
- df.columns = header
87
- first_numeric_idx = None
88
- for i, row in df.iterrows():
89
- numeric_count = get_numeric_count(row)
90
- if numeric_count >= 2:
91
- first_numeric_idx = i
92
- break
93
-
94
- df = df.iloc[first_numeric_idx:]
95
- df = df.reset_index(drop=True)
96
-
97
- merged_rows = []
98
- buffer = None
99
-
100
- for i in range(len(df)):
101
- row = df.iloc[i]
102
- numeric_count = get_numeric_count(row)
103
-
104
- if numeric_count < 2:
105
- if buffer is None:
106
- buffer = list(df.iloc[i].copy())
107
- else:
108
- buffer = [
109
- " ".join(filter(lambda x: x not in [None, "None", ""], [buffer[j], df.iloc[i, j]]))
110
- for j in range(df.shape[1])
111
- ]
112
- merged_rows.append(i)
113
- else:
114
- if buffer is not None:
115
- df.iloc[i] = [
116
- " ".join(filter(lambda x: x not in [None, "None", ""], [buffer[j], df.iloc[i, j]]))
117
- for j in range(df.shape[1])
118
- ]
119
- buffer = None
120
-
121
- clean_df = df.drop(merged_rows).reset_index(drop=True)
122
- return clean_df
123
-
124
-
125
- def clean_and_concatenate_tables(tables):
126
- dataframes = [table.df for table in tables]
127
-
128
- for i in range(len(dataframes)):
129
- df = dataframes[i]
130
- row_counts = df.apply(lambda row: row.notna().sum() - (row.astype(str) == "").sum(), axis=1)
131
- col_counts = df.apply(lambda col: col.notna().sum() - (col.astype(str) == "").sum(), axis=0)
132
- dataframes[i] = df.loc[row_counts >= 1, col_counts >= 3].reset_index(drop = True)
133
-
134
- new_header = get_headers(dataframes)
135
-
136
- cleaned_dfs = []
137
-
138
- for df in dataframes:
139
- cleaned_dfs.append(clean_dataframe(df,new_header))
140
-
141
- concatenated_df = pd.concat(cleaned_dfs, ignore_index=True)
142
- return concatenated_df
143
-
144
-
145
- def convert_to_excel(reduced_pdf_folder, output_folder):
146
- if os.path.exists(output_folder):
147
- shutil.rmtree(output_folder)
148
- os.makedirs(output_folder)
149
-
150
- for filename in os.listdir(reduced_pdf_folder):
151
- if filename.endswith('.pdf'):
152
- pdf_path = os.path.join(reduced_pdf_folder, filename)
153
- tables = extract_tables_camelot(pdf_path)
154
- if tables:
155
- concatenated_df = clean_and_concatenate_tables(tables)
156
-
157
- excel_path = os.path.join(output_folder, filename.replace('.pdf', '.xlsx'))
158
-
159
- for col in concatenated_df.columns:
160
- if any(str(cell).strip() and not str(cell).strip().startswith('(') for cell in concatenated_df[col]):
161
- concatenated_df[col] = concatenated_df[col].apply(convert_to_numeric)
162
-
163
- wb = openpyxl.Workbook()
164
- ws = wb.active
165
-
166
- percentage_cells = []
167
-
168
- # Add the DataFrame data to the worksheet
169
- for r_idx, r in enumerate(dataframe_to_rows(concatenated_df, index=False, header=True)):
170
- ws.append(r)
171
- for c_idx, value in enumerate(r):
172
- if isinstance(value, str) and value.endswith('%'):
173
- numeric_value = pd.to_numeric(value.strip('%'), errors='coerce') / 100
174
- ws.cell(row=r_idx + 1, column=c_idx + 1, value=numeric_value)
175
- percentage_cells.append((r_idx + 1, c_idx + 1))
176
-
177
- tab = Table(displayName = "Table1",ref=ws.dimensions)
178
- style = TableStyleInfo(
179
- name="TableStyleMedium9",
180
- showFirstColumn=False,
181
- showLastColumn=False,
182
- showRowStripes=True,
183
- showColumnStripes=True
184
- )
185
- tab.tableStyleInfo = style
186
-
187
- ws.add_table(tab)
188
-
189
- # Ajuster la largeur des colonnes
190
- for column_cells in ws.columns:
191
- length = min(max(len(str(cell.value)) for cell in column_cells),30)
192
- ws.column_dimensions[column_cells[0].column_letter].width = length + 2
193
-
194
- for row, col in percentage_cells:
195
- cell = ws.cell(row=row, column=col)
196
- cell.number_format = numbers.BUILTIN_FORMATS[10]
197
- wb.save(excel_path)
198
- print(f'Saved {filename} as Excel file')
199
- else:
200
- print(f'No tables found in {filename}')
201
- shutil.make_archive(base_name="./output", format='zip', root_dir="./outputs")
202
-
203
-
204
- def reduce_and_convert(input_folder):
205
- reduced_pdf_folder = "./reduced_pdf"
206
- output_folder = './outputs'
207
- reduce_pdf(input_folder,reduced_pdf_folder)
208
- convert_to_excel(reduced_pdf_folder, output_folder)
209
-
210
-
211
-
212
 
213
  def clear_gradio_temp(exclude_files):
214
  temp_dir = tempfile.gettempdir()
 
1
  import gradio as gr
2
  import os
3
  import shutil
 
 
 
 
 
 
 
 
4
  import tempfile
5
+ from reduce_and_convert_PDF import reduce_and_convert
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def clear_gradio_temp(exclude_files):
8
  temp_dir = tempfile.gettempdir()
reduce_and_convert_PDF.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import re
4
+ from PyPDF2 import PdfReader, PdfWriter
5
+ import pandas as pd
6
+ import camelot
7
+ import openpyxl
8
+ from openpyxl.utils.dataframe import dataframe_to_rows
9
+ from openpyxl.styles import numbers
10
+ from openpyxl.worksheet.table import Table, TableStyleInfo
11
+
12
def extract_pages(pdf_path, start_page, end_page, output_path):
    """Copy a range of pages from one PDF into a new PDF file.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page to copy, 1-based.
        end_page: Last page to copy, 1-based and inclusive.
        output_path: Path of the PDF to write.

    Pages outside the document are skipped rather than raising, so the
    output may contain fewer pages than requested (possibly none).
    """
    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    # Clamp on both sides: without the lower bound a page number of 0 would
    # index reader.pages[-1] and silently copy the *last* page.
    page_count = len(reader.pages)
    first_page = max(start_page, 1)
    last_page = min(end_page, page_count)

    for page_num in range(first_page, last_page + 1):
        # Page numbers are 1-based, reader.pages is 0-based.
        writer.add_page(reader.pages[page_num - 1])

    with open(output_path, 'wb') as output_pdf_file:
        writer.write(output_pdf_file)
21
+
22
+
23
def reduce_pdf(pdf_folder,reduced_pdf_folder):
    """Write a page-reduced copy of every tagged PDF found in pdf_folder.

    A file is processed when its name ends in '.pdf' and contains
    '_CbCR_<page>' or '_CbCR_<first>-<last>'. The selected pages are written
    to reduced_pdf_folder under the same name with the page numbers removed
    ('..._CbCR.pdf'). reduced_pdf_folder is deleted and recreated first, so
    it only ever holds the results of the current run.
    """
    # Always start from an empty destination folder.
    if os.path.exists(reduced_pdf_folder):
        shutil.rmtree(reduced_pdf_folder)
    os.makedirs(reduced_pdf_folder)

    for filename in os.listdir(pdf_folder):
        if not filename.endswith('.pdf'):
            continue

        page_range = re.search(r'_CbCR_(\d+)(?:-(\d+))?', filename)
        if page_range is None:
            # No page information in the name: nothing to extract.
            continue

        first_raw, last_raw = page_range.groups()
        start_page = int(first_raw)
        # A single page number means a one-page range.
        end_page = int(last_raw) if last_raw else start_page

        base_name = re.sub(r'_CbCR_\d+(?:-\d+)?\.pdf$', '_CbCR.pdf', filename)
        extract_pages(
            os.path.join(pdf_folder, filename),
            start_page,
            end_page,
            os.path.join(reduced_pdf_folder, base_name),
        )
        print(f'Processed {filename} -> {base_name}')
40
+
41
+
42
+
43
+
44
+
45
def extract_tables_camelot(pdf_path):
    """Return the tables Camelot finds on all pages of pdf_path.

    The 'stream' flavor is used for the extraction.
    """
    return camelot.read_pdf(pdf_path, flavor='stream', pages='all')
49
+
50
def get_numeric_count(row):
    """Count the cells of row that hold a number or a dash placeholder.

    A cell counts when, after removing commas and surrounding parentheses,
    it parses as a number, or when it is exactly '-' or '–'. Every cell is
    expected to be a string.
    """
    dash_placeholders = ['-', '–']

    def looks_numeric(cell):
        # "1,234" and "(56)" are both accepted as numbers.
        stripped = cell.replace(",", "").strip('()')
        parsed = pd.to_numeric(stripped, errors='coerce')
        return pd.notna(parsed) or cell in dash_placeholders

    return sum(1 for cell in row if looks_numeric(cell))
53
+
54
+
55
def convert_to_numeric(value):
    """Convert a cell that looks like a number into a numeric value.

    Strings wrapped in parentheses are read as negative numbers
    ("(1,234)" -> -1234) and commas are treated as thousands separators.

    Args:
        value: The cell content; usually a string.

    Returns:
        The parsed number when value is a string made only of digits and
        the characters '-', ',' and '.'. Strings made only of those
        characters that still do not parse (for example a lone '-') become
        NaN. Anything else, including non-string values, is returned
        unchanged.
    """
    # Values that are already numbers (or None/NaN) have no string methods
    # and need no conversion.
    if not isinstance(value, str):
        return value

    candidate = value
    if candidate.startswith('(') and candidate.endswith(')'):
        candidate = '-' + candidate[1:-1]

    if all(char.isdigit() or char in '-,.' for char in candidate):
        return pd.to_numeric(candidate.replace(',', ''), errors='coerce')

    # Return the untouched original so that text such as "(note)" does not
    # come back as "-note".
    return value
63
+
64
def get_headers(dataframes):
    """Build column names from the text rows above the first data row.

    The second dataframe is used when there are at least two, otherwise the
    first one. Every row located before the first row with two or more
    numeric cells is treated as header text; for each column those cells are
    joined with spaces, skipping empty ones.

    Args:
        dataframes: Non-empty list of dataframes whose cells are strings.

    Returns:
        A list with one header string per column of the chosen dataframe.

    Raises:
        IndexError: If dataframes is empty.
    """
    source = dataframes[1] if len(dataframes) >= 2 else dataframes[0]

    # Without any numeric row the whole table is header text. Starting from
    # len(source) also avoids an unbound variable in that case.
    first_numeric_idx = len(source)
    # Track the position ourselves: it is used with iloc below, which is
    # positional and must not depend on the index labels.
    for position, (_, row) in enumerate(source.iterrows()):
        if get_numeric_count(row) >= 2:
            first_numeric_idx = position
            break

    # Work on strings only, with missing values turned into "".
    source = source.astype(str).where(pd.notna(source), "")

    return [
        " ".join(filter(None, source.iloc[:first_numeric_idx, col].values))
        for col in range(source.shape[1])
    ]
81
+
82
def clean_dataframe(df,header):
    """Rename the columns, drop the header rows and fold text rows into data rows.

    header is assigned to df.columns on the dataframe passed in. Rows before
    the first row with two or more numeric cells are discarded. After that,
    every row with fewer than two numeric cells is buffered, prepended cell
    by cell (joined with a space) to the next numeric row, and removed.
    Buffered rows that are never followed by a numeric row are removed
    without being merged anywhere.

    Returns:
        A new dataframe with a fresh 0..n-1 index.
    """
    df.columns = header

    # Label of the first data row, or None when there is none (the slice
    # below then keeps every row).
    first_numeric_idx = next(
        (label for label, cells in df.iterrows() if get_numeric_count(cells) >= 2),
        None,
    )
    df = df.iloc[first_numeric_idx:].reset_index(drop=True)

    def merge_with_row(pending, position):
        # Join the buffered text and the row's own cells, column by column.
        return [
            " ".join(
                part
                for part in (pending[col], df.iloc[position, col])
                if part not in [None, "None", ""]
            )
            for col in range(df.shape[1])
        ]

    text_only_rows = []
    pending = None

    for position in range(len(df)):
        if get_numeric_count(df.iloc[position]) >= 2:
            # Data row: attach whatever text was waiting for it.
            if pending is not None:
                df.iloc[position] = merge_with_row(pending, position)
                pending = None
            continue

        # Text row: start a new buffer or extend the current one.
        if pending is None:
            pending = list(df.iloc[position].copy())
        else:
            pending = merge_with_row(pending, position)
        text_only_rows.append(position)

    return df.drop(text_only_rows).reset_index(drop=True)
121
+
122
+
123
def clean_and_concatenate_tables(tables):
    """Trim, clean and stack the dataframes of the extracted tables.

    For every table, rows without any non-empty cell and columns with fewer
    than three non-empty cells are dropped. One header is then derived from
    the trimmed tables and applied to each of them while cleaning, and the
    results are concatenated into a single dataframe.
    """
    def filled_cells(cells):
        # Number of cells that are neither missing nor an empty string.
        return cells.notna().sum() - (cells.astype(str) == "").sum()

    trimmed = []
    for table in tables:
        frame = table.df
        filled_per_row = frame.apply(filled_cells, axis=1)
        filled_per_col = frame.apply(filled_cells, axis=0)
        kept = frame.loc[filled_per_row >= 1, filled_per_col >= 3]
        trimmed.append(kept.reset_index(drop=True))

    shared_header = get_headers(trimmed)
    cleaned = [clean_dataframe(frame, shared_header) for frame in trimmed]
    return pd.concat(cleaned, ignore_index=True)
141
+
142
+
143
def _write_table_workbook(dataframe, excel_path):
    """Save dataframe to excel_path as a single styled Excel table."""
    wb = openpyxl.Workbook()
    ws = wb.active

    percentage_cells = []

    # Add the DataFrame data to the worksheet; row 1 is the header row.
    rows = dataframe_to_rows(dataframe, index=False, header=True)
    for row_number, row_values in enumerate(rows, start=1):
        ws.append(row_values)
        if row_number == 1:
            # Never reinterpret column names such as "Tax rate %".
            continue
        for col_number, value in enumerate(row_values, start=1):
            if not (isinstance(value, str) and value.endswith('%')):
                continue
            ratio = pd.to_numeric(value.strip('%'), errors='coerce') / 100
            if pd.isna(ratio):
                # Not a real percentage: keep the text that was appended.
                continue
            ws.cell(row=row_number, column=col_number, value=ratio)
            percentage_cells.append((row_number, col_number))

    tab = Table(displayName="Table1", ref=ws.dimensions)
    tab.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9",
        showFirstColumn=False,
        showLastColumn=False,
        showRowStripes=True,
        showColumnStripes=True,
    )
    ws.add_table(tab)

    # Adjust the column widths to the longest cell, capped at 30 characters.
    for column_cells in ws.columns:
        length = min(max(len(str(cell.value)) for cell in column_cells), 30)
        ws.column_dimensions[column_cells[0].column_letter].width = length + 2

    # Built-in number format 10 is the two-decimal percentage format.
    for row_number, col_number in percentage_cells:
        ws.cell(row=row_number, column=col_number).number_format = numbers.BUILTIN_FORMATS[10]

    wb.save(excel_path)


def convert_to_excel(reduced_pdf_folder, output_folder):
    """Convert every PDF of reduced_pdf_folder to an Excel file and zip the results.

    Args:
        reduced_pdf_folder: Folder holding the page-reduced PDFs.
        output_folder: Folder receiving one .xlsx per PDF in which tables
            were found. It is deleted and recreated first.

    Side effects:
        Writes the .xlsx files, prints one status line per PDF and creates
        './output.zip' from the content of output_folder.
    """
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    os.makedirs(output_folder)

    for filename in os.listdir(reduced_pdf_folder):
        if not filename.endswith('.pdf'):
            continue

        tables = extract_tables_camelot(os.path.join(reduced_pdf_folder, filename))
        if not tables:
            print(f'No tables found in {filename}')
            continue

        concatenated_df = clean_and_concatenate_tables(tables)

        # Swap only the extension; str.replace would also rewrite a '.pdf'
        # appearing earlier in the name.
        excel_name = filename[:-len('.pdf')] + '.xlsx'
        excel_path = os.path.join(output_folder, excel_name)

        for col in concatenated_df.columns:
            # Convert a column as soon as it holds at least one non-blank
            # cell that does not start with '('.
            if any(
                str(cell).strip() and not str(cell).strip().startswith('(')
                for cell in concatenated_df[col]
            ):
                concatenated_df[col] = concatenated_df[col].apply(convert_to_numeric)

        _write_table_workbook(concatenated_df, excel_path)
        print(f'Saved {filename} as Excel file')

    # Archive the folder that was actually written to, not a hard-coded
    # "./outputs" that only matches one particular caller.
    shutil.make_archive(base_name="./output", format='zip', root_dir=output_folder)
200
+
201
+
202
def reduce_and_convert(input_folder):
    """Run the full pipeline on input_folder.

    The PDFs are first reduced to their tagged pages in './reduced_pdf',
    then converted to Excel files in './outputs'.
    """
    reduced_pdf_folder, output_folder = "./reduced_pdf", './outputs'

    # Step 1: keep only the pages named in each file name.
    reduce_pdf(input_folder, reduced_pdf_folder)
    # Step 2: turn the reduced PDFs into Excel workbooks.
    convert_to_excel(reduced_pdf_folder, output_folder)
207
+
208
+
209
if __name__ == "__main__":
    # Script entry point: process the PDFs found in ../example_pdf.
    reduce_and_convert("../example_pdf")