Update app.py
Browse files
app.py
CHANGED
|
@@ -4,7 +4,6 @@ import fitz # PyMuPDF
|
|
| 4 |
import os
|
| 5 |
from huggingface_hub import HfApi
|
| 6 |
from huggingface_hub.utils import HfHubHTTPError
|
| 7 |
-
import requests
|
| 8 |
import time
|
| 9 |
|
| 10 |
def extract_paragraphs_with_headers(pdf_path, progress=None):
|
|
@@ -48,47 +47,35 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
|
|
| 48 |
|
| 49 |
def upload_with_progress(file_path, repo_id, token, progress):
|
| 50 |
"""
|
| 51 |
-
Upload file to Hugging Face Dataset
|
| 52 |
"""
|
| 53 |
print(f"📤 Starting upload of Parquet: {file_path}")
|
| 54 |
file_size = os.path.getsize(file_path)
|
| 55 |
-
url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
|
| 56 |
|
| 57 |
-
|
| 58 |
-
"Authorization": f"Bearer {token}"
|
| 59 |
-
}
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
break
|
| 71 |
-
|
| 72 |
-
response = requests.put(
|
| 73 |
-
url,
|
| 74 |
-
headers=headers,
|
| 75 |
-
data=chunk
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
uploaded += len(chunk)
|
| 79 |
-
if progress is not None:
|
| 80 |
-
progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
|
| 81 |
-
time.sleep(0.1) # Smooth progress update
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
raise Exception("⚠️ Upload exceeded expected chunk limit. Aborting.")
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
| 94 |
all_data = []
|
|
@@ -148,8 +135,8 @@ iface = gr.Interface(
|
|
| 148 |
gr.File(label="Download Parquet File"),
|
| 149 |
gr.Textbox(label="Status")
|
| 150 |
],
|
| 151 |
-
title="PDF to Parquet Converter with
|
| 152 |
-
description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset
|
| 153 |
)
|
| 154 |
|
| 155 |
iface.launch()
|
|
|
|
| 4 |
import os
|
| 5 |
from huggingface_hub import HfApi
|
| 6 |
from huggingface_hub.utils import HfHubHTTPError
|
|
|
|
| 7 |
import time
|
| 8 |
|
| 9 |
def extract_paragraphs_with_headers(pdf_path, progress=None):
|
|
|
|
| 47 |
|
| 48 |
def upload_with_progress(file_path, repo_id, token, progress):
|
| 49 |
"""
|
| 50 |
+
Upload file to Hugging Face Dataset using upload_file() API method.
|
| 51 |
"""
|
| 52 |
print(f"📤 Starting upload of Parquet: {file_path}")
|
| 53 |
file_size = os.path.getsize(file_path)
|
|
|
|
| 54 |
|
| 55 |
+
api = HfApi()
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
try:
|
| 58 |
+
# Use upload_file() method from huggingface_hub
|
| 59 |
+
api.upload_file(
|
| 60 |
+
path_or_fileobj=file_path,
|
| 61 |
+
path_in_repo=os.path.basename(file_path),
|
| 62 |
+
repo_id=repo_id,
|
| 63 |
+
repo_type="dataset",
|
| 64 |
+
token=token
|
| 65 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
if progress is not None:
|
| 68 |
+
progress(1, desc="✅ Upload Complete")
|
|
|
|
| 69 |
|
| 70 |
+
print(f"✅ Successfully uploaded to {repo_id}")
|
| 71 |
+
return f"✅ Successfully uploaded to {repo_id}"
|
| 72 |
|
| 73 |
+
except HfHubHTTPError as e:
|
| 74 |
+
print(f"❌ Upload failed: {e}")
|
| 75 |
+
return f"❌ Upload failed: {str(e)}"
|
| 76 |
+
except Exception as e:
|
| 77 |
+
print(f"❌ Unexpected error: {e}")
|
| 78 |
+
return f"❌ Unexpected error: {str(e)}"
|
| 79 |
|
| 80 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
| 81 |
all_data = []
|
|
|
|
| 135 |
gr.File(label="Download Parquet File"),
|
| 136 |
gr.Textbox(label="Status")
|
| 137 |
],
|
| 138 |
+
title="PDF to Parquet Converter with Correct Upload API",
|
| 139 |
+
description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset using the official API."
|
| 140 |
)
|
| 141 |
|
| 142 |
iface.launch()
|