Jobey1 committed on
Commit
081cec0
·
verified ·
1 Parent(s): c2fa3ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -27
app.py CHANGED
@@ -128,37 +128,44 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
128
  "content": content
129
  }
130
 
131
def upload_with_progress(file_path, repo_id, token, progress):
    """
    Upload a file to a Hugging Face dataset repo via ``HfApi.upload_file()``.

    Args:
        file_path: Local path of the file to upload. Its basename is used
            as the path inside the repository.
        repo_id: Id of the target repository (uploaded with
            ``repo_type="dataset"``).
        token: Token forwarded to ``HfApi.upload_file()``.
        progress: Optional callable invoked once as
            ``progress(1, desc=...)`` after the upload finishes; may be
            ``None``.

    Returns:
        A status string: a success message, or a message starting with
        "❌" describing the failure. Failures are reported through the
        return value rather than raised.
    """
    print(f"📤 Starting upload of Parquet: {file_path}")

    # Fail fast with the same status-string contract used for every other
    # failure, instead of letting a missing file escape as an OSError.
    if not os.path.isfile(file_path):
        message = f"❌ Upload failed: file not found: {file_path}"
        print(message)
        return message

    api = HfApi()

    try:
        # Use upload_file() method from huggingface_hub
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )

        if progress is not None:
            progress(1, desc="✅ Upload Complete")

        print(f"✅ Successfully uploaded to {repo_id}")
        return f"✅ Successfully uploaded to {repo_id}"

    except HfHubHTTPError as e:
        print(f"❌ Upload failed: {e}")
        return f"❌ Upload failed: {str(e)}"
    except Exception as e:
        # Top-level boundary for the UI: report anything else as text.
        print(f"❌ Unexpected error: {e}")
        return f"❌ Unexpected error: {str(e)}"
162
 
163
  # ✅ Gradio Interface
164
  iface = gr.Interface(
 
128
  "content": content
129
  }
130
 
131
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
    """
    Convert uploaded PDFs to Parquet files and optionally upload them.

    For each PDF, the content is extracted with
    ``extract_full_paper_with_labels``, written to a Parquet file named
    after the sanitized title, and, when ``action_choice`` is
    "Upload to Hugging Face" or "Both", uploaded with
    ``upload_with_progress``.

    Args:
        pdf_files: Sequence of uploaded file objects; each must expose its
            path as ``.name``. ``None`` or an empty sequence is rejected
            with an error message.
        hf_token: Token forwarded to ``upload_with_progress``.
        dataset_repo_id: Repository id forwarded to
            ``upload_with_progress``.
        action_choice: Selected action; uploads happen only for
            "Upload to Hugging Face" or "Both".
        progress: Optional progress callable, invoked as
            ``progress(fraction, desc=...)`` before each file; may be
            ``None``.

    Returns:
        A ``(parquet_file, upload_message)`` tuple. ``parquet_file`` is the
        path of the last Parquet file written, or ``None`` when there was
        no input or a conversion failed. ``upload_message`` holds one line
        per attempted upload (empty when no upload was requested), or the
        error text on failure.
    """
    # Guard against no input: otherwise len(None) raises, and an empty
    # list leaves `parquet_file` unbound at the final return.
    if not pdf_files:
        print("❌ No PDF files provided")
        return None, "❌ No PDF files provided"

    parquet_file = None
    # Collect every upload result so an early failure is not hidden by a
    # later success.
    upload_messages = []
    should_upload = action_choice in ["Upload to Hugging Face", "Both"]

    total_files = len(pdf_files)
    print("🚀 Starting PDF to Parquet Conversion Process")

    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")

        # ✅ Step 1: Process PDF with Full Labels
        extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)

        # ✅ Step 2: Use Title for Parquet Filename
        sanitized_title = sanitize_filename(extracted_data["title"])
        parquet_file = f"{sanitized_title}.parquet"

        # Convert to DataFrame
        df = pd.DataFrame([extracted_data])

        try:
            df.to_parquet(parquet_file, engine='pyarrow', index=False)
            print(f"✅ Parquet saved as: {parquet_file}")
        except Exception as e:
            # A failed conversion aborts the whole run.
            print(f"❌ Parquet Conversion Failed: {str(e)}")
            return None, f"❌ Parquet Conversion Failed: {str(e)}"

        # ✅ Step 3: Upload Parquet (if selected)
        if should_upload:
            try:
                upload_messages.append(
                    upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
                )
            except Exception as e:
                print(f"❌ Upload Failed: {str(e)}")
                upload_messages.append(f"❌ Upload failed: {str(e)}")

    print("🏁 Process Completed")
    return parquet_file, "\n".join(upload_messages)
168
 
 
 
 
 
 
 
169
 
170
  # ✅ Gradio Interface
171
  iface = gr.Interface(