Aymeric Ducatez committed on
Commit
0e91585
·
1 Parent(s): bc95c0b

Correct one-page mistake + parenthesis columns

Browse files
Files changed (1) hide show
  1. app.py +29 -22
app.py CHANGED
@@ -10,10 +10,10 @@ from openpyxl.utils.dataframe import dataframe_to_rows
10
  from openpyxl.styles import numbers
11
  from openpyxl.worksheet.table import Table, TableStyleInfo
12
 
 
13
  def extract_pages(pdf_path, start_page, end_page, output_path):
14
  reader = PdfReader(pdf_path)
15
  writer = PdfWriter()
16
-
17
  for page_num in range(start_page, end_page + 1):
18
  if page_num <= len(reader.pages):
19
  writer.add_page(reader.pages[page_num - 1])
@@ -22,25 +22,28 @@ def extract_pages(pdf_path, start_page, end_page, output_path):
22
  writer.write(output_pdf_file)
23
 
24
 
25
- def reduce_pdf(input_folder,reduced_pdf_folder):
26
- if not os.path.exists(reduced_pdf_folder):
27
- os.makedirs(reduced_pdf_folder)
28
- print(os.listdir(input_folder))
29
- for filename in os.listdir(input_folder):
30
- print(filename)
31
  if filename.endswith('.pdf'):
32
- match = re.search(r'(\d+)-(\d+)', filename)
33
  if match:
34
  start_page = int(match.group(1))
35
- end_page = int(match.group(2))
36
- base_name = re.sub(r'_\d+-\d+\.pdf$', '.pdf', filename)
37
-
38
- pdf_path = os.path.join(input_folder, filename)
39
  output_path = os.path.join(reduced_pdf_folder, base_name)
40
 
41
  extract_pages(pdf_path, start_page, end_page, output_path)
42
  print(f'Processed {filename} -> {base_name}')
43
 
 
 
 
 
44
  def extract_tables_camelot(pdf_path):
45
  # Extract tables from the PDF file using Camelot
46
  tables = camelot.read_pdf(pdf_path, pages='all',flavor='stream')
@@ -140,9 +143,10 @@ def clean_and_concatenate_tables(tables):
140
 
141
 
142
  def convert_to_excel(reduced_pdf_folder, output_folder):
143
- if not os.path.exists(output_folder):
144
- os.makedirs(output_folder)
145
-
 
146
  for filename in os.listdir(reduced_pdf_folder):
147
  if filename.endswith('.pdf'):
148
  pdf_path = os.path.join(reduced_pdf_folder, filename)
@@ -153,7 +157,8 @@ def convert_to_excel(reduced_pdf_folder, output_folder):
153
  excel_path = os.path.join(output_folder, filename.replace('.pdf', '.xlsx'))
154
 
155
  for col in concatenated_df.columns:
156
- concatenated_df[col] = concatenated_df[col].apply(convert_to_numeric)
 
157
 
158
  wb = openpyxl.Workbook()
159
  ws = wb.active
@@ -195,6 +200,7 @@ def convert_to_excel(reduced_pdf_folder, output_folder):
195
  print(f'No tables found in {filename}')
196
  shutil.make_archive(base_name="./output", format='zip', root_dir="./outputs")
197
 
 
198
  def reduce_and_convert(input_folder):
199
  reduced_pdf_folder = "./reduced_pdf"
200
  output_folder = './outputs'
@@ -202,22 +208,23 @@ def reduce_and_convert(input_folder):
202
  convert_to_excel(reduced_pdf_folder, output_folder)
203
 
204
 
 
205
  def ui(input_files):
206
  output_zip = "./output.zip"
207
  if os.path.exists(output_zip):
208
  os.remove(output_zip)
209
 
210
- extract_folder = "./input_folder"
211
- if os.path.exists(extract_folder):
212
- shutil.rmtree(extract_folder)
213
- os.makedirs(extract_folder)
214
 
215
  # Move files into the extract_folder
216
  for file_path in input_files:
217
  print(file_path)
218
- shutil.copy(file_path, extract_folder)
219
 
220
- reduce_and_convert(extract_folder)
221
 
222
  return output_zip
223
 
 
10
  from openpyxl.styles import numbers
11
  from openpyxl.worksheet.table import Table, TableStyleInfo
12
 
13
+
14
  def extract_pages(pdf_path, start_page, end_page, output_path):
15
  reader = PdfReader(pdf_path)
16
  writer = PdfWriter()
 
17
  for page_num in range(start_page, end_page + 1):
18
  if page_num <= len(reader.pages):
19
  writer.add_page(reader.pages[page_num - 1])
 
22
  writer.write(output_pdf_file)
23
 
24
 
25
def reduce_pdf(pdf_folder, reduced_pdf_folder):
    """Write a trimmed copy of every page-tagged PDF found in *pdf_folder*.

    A file is processed only when its name ends with ``.pdf`` and contains
    ``_CbCR_<start>`` or ``_CbCR_<start>-<end>``; a missing ``<end>`` means
    the range is the single page ``<start>``.  The selected pages are handed
    to ``extract_pages`` and saved in *reduced_pdf_folder* under the same
    name with the page suffix collapsed to ``_CbCR.pdf``.

    Args:
        pdf_folder: Folder that is scanned for source PDFs.
        reduced_pdf_folder: Destination folder.  It is deleted and recreated
            on every call, so anything it contained beforehand is lost.
    """
    # Always start from an empty destination folder.
    if os.path.exists(reduced_pdf_folder):
        shutil.rmtree(reduced_pdf_folder)
    os.makedirs(reduced_pdf_folder)

    for filename in os.listdir(pdf_folder):
        if not filename.endswith('.pdf'):
            continue

        page_tag = re.search(r'_CbCR_(\d+)(?:-(\d+))?', filename)
        if not page_tag:
            # No page information in the name: nothing to extract.
            continue

        first_digits, last_digits = page_tag.group(1), page_tag.group(2)
        start_page = int(first_digits)
        # The "-<end>" part is optional; fall back to a one-page range.
        end_page = int(last_digits) if last_digits else start_page

        # Strip the page suffix so the output keeps only the "_CbCR" marker.
        base_name = re.sub(r'_CbCR_\d+(?:-\d+)?\.pdf$', '_CbCR.pdf', filename)
        source_path = os.path.join(pdf_folder, filename)
        output_path = os.path.join(reduced_pdf_folder, base_name)

        extract_pages(source_path, start_page, end_page, output_path)
        print(f'Processed {filename} -> {base_name}')
42
 
43
+
44
+
45
+
46
+
47
  def extract_tables_camelot(pdf_path):
48
  # Extract tables from the PDF file using Camelot
49
  tables = camelot.read_pdf(pdf_path, pages='all',flavor='stream')
 
143
 
144
 
145
  def convert_to_excel(reduced_pdf_folder, output_folder):
146
+ if os.path.exists(output_folder):
147
+ shutil.rmtree(output_folder)
148
+ os.makedirs(output_folder)
149
+
150
  for filename in os.listdir(reduced_pdf_folder):
151
  if filename.endswith('.pdf'):
152
  pdf_path = os.path.join(reduced_pdf_folder, filename)
 
157
  excel_path = os.path.join(output_folder, filename.replace('.pdf', '.xlsx'))
158
 
159
  for col in concatenated_df.columns:
160
+ if any(str(cell).strip() and not str(cell).strip().startswith('(') for cell in concatenated_df[col]):
161
+ concatenated_df[col] = concatenated_df[col].apply(convert_to_numeric)
162
 
163
  wb = openpyxl.Workbook()
164
  ws = wb.active
 
200
  print(f'No tables found in {filename}')
201
  shutil.make_archive(base_name="./output", format='zip', root_dir="./outputs")
202
 
203
+
204
  def reduce_and_convert(input_folder):
205
  reduced_pdf_folder = "./reduced_pdf"
206
  output_folder = './outputs'
 
208
  convert_to_excel(reduced_pdf_folder, output_folder)
209
 
210
 
211
+
212
def ui(input_files):
    """Stage the given files, run the conversion pipeline, return the zip path.

    Args:
        input_files: Iterable of file paths; each one is printed and copied
            into a freshly recreated ``./input_folder``.

    Returns:
        The string ``"./output.zip"``.  The archive itself is expected to be
        produced by ``reduce_and_convert``; this function only removes a
        stale one before the run.
    """
    archive_path = "./output.zip"
    # Remove the archive left behind by a previous run.
    if os.path.exists(archive_path):
        os.remove(archive_path)

    # Recreate the staging folder so files from earlier runs do not leak in.
    staging_dir = "./input_folder"
    if os.path.exists(staging_dir):
        shutil.rmtree(staging_dir)
    os.makedirs(staging_dir)

    # Copy (not move) every supplied file into the staging folder.
    for uploaded_path in input_files:
        print(uploaded_path)
        shutil.copy(uploaded_path, staging_dir)

    reduce_and_convert(staging_dir)

    return archive_path
230