Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,6 @@ import fitz # PyMuPDF
|
|
4 |
import os
|
5 |
from huggingface_hub import HfApi
|
6 |
from huggingface_hub.utils import HfHubHTTPError
|
7 |
-
import requests
|
8 |
import time
|
9 |
|
10 |
def extract_paragraphs_with_headers(pdf_path, progress=None):
|
@@ -48,47 +47,35 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
|
|
48 |
|
49 |
def upload_with_progress(file_path, repo_id, token, progress):
|
50 |
"""
|
51 |
-
Upload file to Hugging Face Dataset
|
52 |
"""
|
53 |
print(f"📤 Starting upload of Parquet: {file_path}")
|
54 |
file_size = os.path.getsize(file_path)
|
55 |
-
url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
|
56 |
|
57 |
-
|
58 |
-
"Authorization": f"Bearer {token}"
|
59 |
-
}
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
break
|
71 |
-
|
72 |
-
response = requests.put(
|
73 |
-
url,
|
74 |
-
headers=headers,
|
75 |
-
data=chunk
|
76 |
-
)
|
77 |
-
|
78 |
-
uploaded += len(chunk)
|
79 |
-
if progress is not None:
|
80 |
-
progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
|
81 |
-
time.sleep(0.1) # Smooth progress update
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
raise Exception("⚠️ Upload exceeded expected chunk limit. Aborting.")
|
86 |
|
87 |
-
|
88 |
-
|
89 |
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
92 |
|
93 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
94 |
all_data = []
|
@@ -148,8 +135,8 @@ iface = gr.Interface(
|
|
148 |
gr.File(label="Download Parquet File"),
|
149 |
gr.Textbox(label="Status")
|
150 |
],
|
151 |
-
title="PDF to Parquet Converter with
|
152 |
-
description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset
|
153 |
)
|
154 |
|
155 |
iface.launch()
|
|
|
4 |
import os
|
5 |
from huggingface_hub import HfApi
|
6 |
from huggingface_hub.utils import HfHubHTTPError
|
|
|
7 |
import time
|
8 |
|
9 |
def extract_paragraphs_with_headers(pdf_path, progress=None):
|
|
|
47 |
|
48 |
def upload_with_progress(file_path, repo_id, token, progress):
|
49 |
"""
|
50 |
+
Upload file to Hugging Face Dataset using upload_file() API method.
|
51 |
"""
|
52 |
print(f"📤 Starting upload of Parquet: {file_path}")
|
53 |
file_size = os.path.getsize(file_path)
|
|
|
54 |
|
55 |
+
api = HfApi()
|
|
|
|
|
56 |
|
57 |
+
try:
|
58 |
+
# Use upload_file() method from huggingface_hub
|
59 |
+
api.upload_file(
|
60 |
+
path_or_fileobj=file_path,
|
61 |
+
path_in_repo=os.path.basename(file_path),
|
62 |
+
repo_id=repo_id,
|
63 |
+
repo_type="dataset",
|
64 |
+
token=token
|
65 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
+
if progress is not None:
|
68 |
+
progress(1, desc="✅ Upload Complete")
|
|
|
69 |
|
70 |
+
print(f"✅ Successfully uploaded to {repo_id}")
|
71 |
+
return f"✅ Successfully uploaded to {repo_id}"
|
72 |
|
73 |
+
except HfHubHTTPError as e:
|
74 |
+
print(f"❌ Upload failed: {e}")
|
75 |
+
return f"❌ Upload failed: {str(e)}"
|
76 |
+
except Exception as e:
|
77 |
+
print(f"❌ Unexpected error: {e}")
|
78 |
+
return f"❌ Unexpected error: {str(e)}"
|
79 |
|
80 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
81 |
all_data = []
|
|
|
135 |
gr.File(label="Download Parquet File"),
|
136 |
gr.Textbox(label="Status")
|
137 |
],
|
138 |
+
title="PDF to Parquet Converter with Correct Upload API",
|
139 |
+
description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset using the official API."
|
140 |
)
|
141 |
|
142 |
iface.launch()
|