Spaces:
Running
Running
Aymeric Ducatez
committed on
Commit
·
0e91585
1
Parent(s):
bc95c0b
Correct one-page mistake + parenthesis columns
Browse files
app.py
CHANGED
@@ -10,10 +10,10 @@ from openpyxl.utils.dataframe import dataframe_to_rows
|
|
10 |
from openpyxl.styles import numbers
|
11 |
from openpyxl.worksheet.table import Table, TableStyleInfo
|
12 |
|
|
|
13 |
def extract_pages(pdf_path, start_page, end_page, output_path):
|
14 |
reader = PdfReader(pdf_path)
|
15 |
writer = PdfWriter()
|
16 |
-
|
17 |
for page_num in range(start_page, end_page + 1):
|
18 |
if page_num <= len(reader.pages):
|
19 |
writer.add_page(reader.pages[page_num - 1])
|
@@ -22,25 +22,28 @@ def extract_pages(pdf_path, start_page, end_page, output_path):
|
|
22 |
writer.write(output_pdf_file)
|
23 |
|
24 |
|
25 |
-
def reduce_pdf(
|
26 |
-
if
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
if filename.endswith('.pdf'):
|
32 |
-
match = re.search(r'(\d+)
|
33 |
if match:
|
34 |
start_page = int(match.group(1))
|
35 |
-
end_page = int(match.group(2))
|
36 |
-
base_name = re.sub(r'
|
37 |
-
|
38 |
-
pdf_path = os.path.join(input_folder, filename)
|
39 |
output_path = os.path.join(reduced_pdf_folder, base_name)
|
40 |
|
41 |
extract_pages(pdf_path, start_page, end_page, output_path)
|
42 |
print(f'Processed {filename} -> {base_name}')
|
43 |
|
|
|
|
|
|
|
|
|
44 |
def extract_tables_camelot(pdf_path):
|
45 |
# Extract tables from the PDF file using Camelot
|
46 |
tables = camelot.read_pdf(pdf_path, pages='all',flavor='stream')
|
@@ -140,9 +143,10 @@ def clean_and_concatenate_tables(tables):
|
|
140 |
|
141 |
|
142 |
def convert_to_excel(reduced_pdf_folder, output_folder):
|
143 |
-
if
|
144 |
-
|
145 |
-
|
|
|
146 |
for filename in os.listdir(reduced_pdf_folder):
|
147 |
if filename.endswith('.pdf'):
|
148 |
pdf_path = os.path.join(reduced_pdf_folder, filename)
|
@@ -153,7 +157,8 @@ def convert_to_excel(reduced_pdf_folder, output_folder):
|
|
153 |
excel_path = os.path.join(output_folder, filename.replace('.pdf', '.xlsx'))
|
154 |
|
155 |
for col in concatenated_df.columns:
|
156 |
-
|
|
|
157 |
|
158 |
wb = openpyxl.Workbook()
|
159 |
ws = wb.active
|
@@ -195,6 +200,7 @@ def convert_to_excel(reduced_pdf_folder, output_folder):
|
|
195 |
print(f'No tables found in {filename}')
|
196 |
shutil.make_archive(base_name="./output", format='zip', root_dir="./outputs")
|
197 |
|
|
|
198 |
def reduce_and_convert(input_folder):
|
199 |
reduced_pdf_folder = "./reduced_pdf"
|
200 |
output_folder = './outputs'
|
@@ -202,22 +208,23 @@ def reduce_and_convert(input_folder):
|
|
202 |
convert_to_excel(reduced_pdf_folder, output_folder)
|
203 |
|
204 |
|
|
|
205 |
def ui(input_files):
|
206 |
output_zip = "./output.zip"
|
207 |
if os.path.exists(output_zip):
|
208 |
os.remove(output_zip)
|
209 |
|
210 |
-
|
211 |
-
if os.path.exists(
|
212 |
-
shutil.rmtree(
|
213 |
-
os.makedirs(
|
214 |
|
215 |
# Move files into the extract_folder
|
216 |
for file_path in input_files:
|
217 |
print(file_path)
|
218 |
-
shutil.copy(file_path,
|
219 |
|
220 |
-
reduce_and_convert(
|
221 |
|
222 |
return output_zip
|
223 |
|
|
|
10 |
from openpyxl.styles import numbers
|
11 |
from openpyxl.worksheet.table import Table, TableStyleInfo
|
12 |
|
13 |
+
|
14 |
def extract_pages(pdf_path, start_page, end_page, output_path):
|
15 |
reader = PdfReader(pdf_path)
|
16 |
writer = PdfWriter()
|
|
|
17 |
for page_num in range(start_page, end_page + 1):
|
18 |
if page_num <= len(reader.pages):
|
19 |
writer.add_page(reader.pages[page_num - 1])
|
|
|
22 |
writer.write(output_pdf_file)
|
23 |
|
24 |
|
25 |
+
def reduce_pdf(pdf_folder, reduced_pdf_folder):
    """Extract the page range encoded in each ``*_CbCR_<start>[-<end>]`` PDF name.

    ``reduced_pdf_folder`` is deleted and recreated first, so it only holds
    the output of the current call. For every ``.pdf`` file in ``pdf_folder``
    whose name contains ``_CbCR_<start>`` or ``_CbCR_<start>-<end>``, the
    1-based page range is handed to ``extract_pages`` and the result is saved
    in ``reduced_pdf_folder`` with the page suffix replaced by ``_CbCR.pdf``.

    Files that carry no page marker, or whose range is invalid (start page
    below 1, or end page before start page), are reported and skipped.

    Args:
        pdf_folder: Folder containing the source PDFs.
        reduced_pdf_folder: Folder that receives the extracted PDFs.
    """
    if os.path.exists(reduced_pdf_folder):
        shutil.rmtree(reduced_pdf_folder)
    os.makedirs(reduced_pdf_folder)

    for filename in os.listdir(pdf_folder):
        if not filename.endswith('.pdf'):
            continue

        match = re.search(r'_CbCR_(\d+)(?:-(\d+))?', filename)
        if not match:
            # Make the drop visible instead of silently losing the file.
            print(f'Skipped {filename}: no _CbCR_<page> marker in the name')
            continue

        start_page = int(match.group(1))
        # A single page number ("_CbCR_7") means a one-page range.
        end_page = int(match.group(2)) if match.group(2) else start_page

        # Pages are 1-based: extract_pages indexes reader.pages[page_num - 1],
        # so page 0 would wrap around to the last page, and a reversed range
        # would produce an empty PDF.
        if start_page < 1 or end_page < start_page:
            print(f'Skipped {filename}: invalid page range {start_page}-{end_page}')
            continue

        base_name = re.sub(r'_CbCR_\d+(?:-\d+)?\.pdf$', '_CbCR.pdf', filename)
        pdf_path = os.path.join(pdf_folder, filename)
        output_path = os.path.join(reduced_pdf_folder, base_name)

        extract_pages(pdf_path, start_page, end_page, output_path)
        print(f'Processed {filename} -> {base_name}')
|
42 |
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
def extract_tables_camelot(pdf_path):
|
48 |
# Extract tables from the PDF file using Camelot
|
49 |
tables = camelot.read_pdf(pdf_path, pages='all',flavor='stream')
|
|
|
143 |
|
144 |
|
145 |
def convert_to_excel(reduced_pdf_folder, output_folder):
|
146 |
+
if os.path.exists(output_folder):
|
147 |
+
shutil.rmtree(output_folder)
|
148 |
+
os.makedirs(output_folder)
|
149 |
+
|
150 |
for filename in os.listdir(reduced_pdf_folder):
|
151 |
if filename.endswith('.pdf'):
|
152 |
pdf_path = os.path.join(reduced_pdf_folder, filename)
|
|
|
157 |
excel_path = os.path.join(output_folder, filename.replace('.pdf', '.xlsx'))
|
158 |
|
159 |
for col in concatenated_df.columns:
|
160 |
+
if any(str(cell).strip() and not str(cell).strip().startswith('(') for cell in concatenated_df[col]):
|
161 |
+
concatenated_df[col] = concatenated_df[col].apply(convert_to_numeric)
|
162 |
|
163 |
wb = openpyxl.Workbook()
|
164 |
ws = wb.active
|
|
|
200 |
print(f'No tables found in {filename}')
|
201 |
shutil.make_archive(base_name="./output", format='zip', root_dir="./outputs")
|
202 |
|
203 |
+
|
204 |
def reduce_and_convert(input_folder):
|
205 |
reduced_pdf_folder = "./reduced_pdf"
|
206 |
output_folder = './outputs'
|
|
|
208 |
convert_to_excel(reduced_pdf_folder, output_folder)
|
209 |
|
210 |
|
211 |
+
|
212 |
def ui(input_files):
    """Stage the given files in a fresh folder, run the pipeline, return the zip path.

    Args:
        input_files: Iterable of file paths to process.

    Returns:
        The string "./output.zip". Any file already at that path is removed
        before the pipeline runs; the archive itself is expected to be
        produced by ``reduce_and_convert``.
    """
    archive_path = "./output.zip"
    staging_dir = "./input_folder"

    # Remove the archive from a previous run so a stale one is never returned.
    if os.path.exists(archive_path):
        os.remove(archive_path)

    # Start from an empty staging folder on every call.
    if os.path.exists(staging_dir):
        shutil.rmtree(staging_dir)
    os.makedirs(staging_dir)

    # Copy (not move) each input file into the staging folder.
    for source_path in input_files:
        print(source_path)
        shutil.copy(source_path, staging_dir)

    reduce_and_convert(staging_dir)

    return archive_path
|
230 |
|