Update app.py
Browse files
app.py
CHANGED
@@ -127,27 +127,42 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
127 |
|
128 |
def upload_with_progress(file_path, repo_id, token, progress):
|
129 |
"""
|
130 |
-
Upload file to Hugging Face Dataset
|
131 |
"""
|
132 |
-
from huggingface_hub import HfApi
|
133 |
-
from huggingface_hub.utils import HfHubHTTPError
|
134 |
-
import os
|
135 |
|
136 |
print(f"🚀 Starting upload of Parquet: {file_path}")
|
137 |
file_size = os.path.getsize(file_path)
|
138 |
-
|
139 |
api = HfApi()
|
140 |
|
|
|
|
|
|
|
141 |
try:
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
if progress is not None:
|
152 |
progress(1, desc="✅ Upload Complete")
|
153 |
|
@@ -162,6 +177,7 @@ def upload_with_progress(file_path, repo_id, token, progress):
|
|
162 |
return f"❌ Unexpected error: {str(e)}"
|
163 |
|
164 |
|
|
|
165 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
166 |
upload_message = ""
|
167 |
|
|
|
127 |
|
128 |
def upload_with_progress(file_path, repo_id, token, progress):
|
129 |
"""
|
130 |
+
Upload file to Hugging Face Dataset with progress tracking.
|
131 |
"""
|
|
|
|
|
|
|
132 |
|
133 |
print(f"🚀 Starting upload of Parquet: {file_path}")
|
134 |
file_size = os.path.getsize(file_path)
|
|
|
135 |
api = HfApi()
|
136 |
|
137 |
+
# Get the proper upload URL from the Hugging Face API
|
138 |
+
upload_url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
|
139 |
+
|
140 |
try:
|
141 |
+
with open(file_path, 'rb') as f:
|
142 |
+
chunk_size = 1024 * 1024 # 1 MB chunks
|
143 |
+
uploaded = 0
|
144 |
+
|
145 |
+
headers = {
|
146 |
+
"Authorization": f"Bearer {token}",
|
147 |
+
"Content-Type": "application/octet-stream"
|
148 |
+
}
|
149 |
+
|
150 |
+
while True:
|
151 |
+
chunk = f.read(chunk_size)
|
152 |
+
if not chunk:
|
153 |
+
break # Finished reading file
|
154 |
|
155 |
+
response = requests.put(upload_url, headers=headers, data=chunk)
|
156 |
+
|
157 |
+
if response.status_code != 200:
|
158 |
+
raise Exception(f"Upload failed: {response.text}")
|
159 |
+
|
160 |
+
# Update progress after each chunk
|
161 |
+
uploaded += len(chunk)
|
162 |
+
if progress is not None:
|
163 |
+
progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
|
164 |
+
|
165 |
+
# Final progress update
|
166 |
if progress is not None:
|
167 |
progress(1, desc="✅ Upload Complete")
|
168 |
|
|
|
177 |
return f"❌ Unexpected error: {str(e)}"
|
178 |
|
179 |
|
180 |
+
|
181 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
182 |
upload_message = ""
|
183 |
|