Jobey1 committed on
Commit
c2fa3ae
·
verified ·
1 Parent(s): 9323459

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -35
app.py CHANGED
@@ -128,43 +128,37 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
128
  "content": content
129
  }
130
 
131
- def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
132
- upload_message = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- total_files = len(pdf_files)
135
- print("🚀 Starting PDF to Parquet Conversion Process")
136
-
137
- for idx, pdf_file in enumerate(pdf_files):
138
  if progress is not None:
139
- progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
140
-
141
- # ✅ Step 1: Process PDF with Full Labels
142
- extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
143
-
144
- # ✅ Step 2: Use Title for Parquet Filename
145
- sanitized_title = sanitize_filename(extracted_data["title"])
146
- parquet_file = f"{sanitized_title}.parquet"
147
-
148
- # Convert to DataFrame
149
- df = pd.DataFrame([extracted_data])
150
-
151
- try:
152
- df.to_parquet(parquet_file, engine='pyarrow', index=False)
153
- print(f"✅ Parquet saved as: {parquet_file}")
154
- except Exception as e:
155
- print(f"❌ Parquet Conversion Failed: {str(e)}")
156
- return None, f"❌ Parquet Conversion Failed: {str(e)}"
157
-
158
- # ✅ Step 3: Upload Parquet (if selected)
159
- if action_choice in ["Upload to Hugging Face", "Both"]:
160
- try:
161
- upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
162
- except Exception as e:
163
- print(f"❌ Upload Failed: {str(e)}")
164
- upload_message = f"❌ Upload failed: {str(e)}"
165
-
166
- print("🏁 Process Completed")
167
- return parquet_file, upload_message
168
 
169
  # ✅ Gradio Interface
170
  iface = gr.Interface(
 
128
  "content": content
129
  }
130
 
131
+ def upload_with_progress(file_path, repo_id, token, progress):
132
+ """
133
+ Upload file to Hugging Face Dataset using upload_file() API method.
134
+ """
135
+ print(f"📤 Starting upload of Parquet: {file_path}")
136
+ file_size = os.path.getsize(file_path)
137
+
138
+ api = HfApi()
139
+
140
+ try:
141
+ # Use upload_file() method from huggingface_hub
142
+ api.upload_file(
143
+ path_or_fileobj=file_path,
144
+ path_in_repo=os.path.basename(file_path),
145
+ repo_id=repo_id,
146
+ repo_type="dataset",
147
+ token=token
148
+ )
149
 
 
 
 
 
150
  if progress is not None:
151
+ progress(1, desc="✅ Upload Complete")
152
+
153
+ print(f"✅ Successfully uploaded to {repo_id}")
154
+ return f"✅ Successfully uploaded to {repo_id}"
155
+
156
+ except HfHubHTTPError as e:
157
+ print(f"❌ Upload failed: {e}")
158
+ return f"❌ Upload failed: {str(e)}"
159
+ except Exception as e:
160
+ print(f"❌ Unexpected error: {e}")
161
+ return f"❌ Unexpected error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  # ✅ Gradio Interface
164
  iface = gr.Interface(