Update app.py
Browse files
app.py
CHANGED
@@ -154,22 +154,20 @@ def upload_with_progress(file_path, repo_id, token, progress):
|
|
154 |
print(f"β Unexpected error: {e}")
|
155 |
return f"β Unexpected error: {str(e)}"
|
156 |
|
157 |
-
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
158 |
all_data = []
|
159 |
|
160 |
total_files = len(pdf_files)
|
161 |
print("π Starting PDF to Parquet Conversion Process")
|
162 |
|
163 |
for idx, pdf_file in enumerate(pdf_files):
|
164 |
-
if progress
|
165 |
progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
|
166 |
|
167 |
-
# ✅ Step 1: Process PDF with Full Labels
|
168 |
extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
|
169 |
all_data.append(extracted_data)
|
170 |
|
171 |
print("π‘ Converting Processed Data to Parquet")
|
172 |
-
# ✅ Step 2: Convert to Parquet
|
173 |
df = pd.DataFrame(all_data)
|
174 |
parquet_file = 'fully_labeled_papers.parquet'
|
175 |
|
@@ -178,11 +176,11 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
|
|
178 |
print("β
Parquet Conversion Completed")
|
179 |
except Exception as e:
|
180 |
print(f"β Parquet Conversion Failed: {str(e)}")
|
181 |
-
return None, f"β Parquet Conversion Failed: {str(e)}"
|
182 |
|
183 |
upload_message = "Skipped Upload"
|
184 |
|
185 |
-
# ✅
|
186 |
if action_choice in ["Upload to Hugging Face", "Both"]:
|
187 |
try:
|
188 |
upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
|
@@ -191,7 +189,13 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
|
|
191 |
upload_message = f"β Upload failed: {str(e)}"
|
192 |
|
193 |
print("π Process Completed")
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
|
196 |
# ✅ Gradio Interface
|
197 |
iface = gr.Interface(
|
|
|
154 |
print(f"β Unexpected error: {e}")
|
155 |
return f"β Unexpected error: {str(e)}"
|
156 |
|
157 |
+
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, state, progress=gr.Progress()):
|
158 |
all_data = []
|
159 |
|
160 |
total_files = len(pdf_files)
|
161 |
print("π Starting PDF to Parquet Conversion Process")
|
162 |
|
163 |
for idx, pdf_file in enumerate(pdf_files):
|
164 |
+
if progress:
|
165 |
progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
|
166 |
|
|
|
167 |
extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
|
168 |
all_data.append(extracted_data)
|
169 |
|
170 |
print("π‘ Converting Processed Data to Parquet")
|
|
|
171 |
df = pd.DataFrame(all_data)
|
172 |
parquet_file = 'fully_labeled_papers.parquet'
|
173 |
|
|
|
176 |
print("β
Parquet Conversion Completed")
|
177 |
except Exception as e:
|
178 |
print(f"β Parquet Conversion Failed: {str(e)}")
|
179 |
+
return None, f"β Parquet Conversion Failed: {str(e)}", state
|
180 |
|
181 |
upload_message = "Skipped Upload"
|
182 |
|
183 |
+
# ✅ Upload Parquet if selected
|
184 |
if action_choice in ["Upload to Hugging Face", "Both"]:
|
185 |
try:
|
186 |
upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
|
|
|
189 |
upload_message = f"β Upload failed: {str(e)}"
|
190 |
|
191 |
print("π Process Completed")
|
192 |
+
|
193 |
+
# ✅ Clear Uploaded PDFs and Parquet File
|
194 |
+
if os.path.exists(parquet_file):
|
195 |
+
os.remove(parquet_file)
|
196 |
+
print("ποΈ Parquet file cleared after processing.")
|
197 |
+
|
198 |
+
return None, upload_message, state
|
199 |
|
200 |
# ✅ Gradio Interface
|
201 |
iface = gr.Interface(
|