Update app.py
Browse files
app.py
CHANGED
@@ -6,11 +6,7 @@ import re
|
|
6 |
from huggingface_hub import HfApi
|
7 |
from huggingface_hub.utils import HfHubHTTPError
|
8 |
import time
|
9 |
-
|
10 |
-
def sanitize_filename(title, max_length=100):
    """Turn a paper title into a safe filename.

    Removes characters that are invalid on common filesystems, replaces
    spaces with underscores, and truncates overly long results so the
    name stays within OS filename limits (most filesystems cap entries
    at ~255 bytes). A short md5 suffix keeps truncated names unique.

    Args:
        title: The raw title string to sanitize.
        max_length: Maximum length of the sanitized stem before a hash
            suffix is appended (default 100, backward-compatible for
            short titles).

    Returns:
        A filesystem-safe string suitable for use as a filename stem.
    """
    import hashlib  # local import: only needed for the truncation path

    # Remove invalid characters and replace spaces with underscores
    sanitized = re.sub(r'[\\/*?:"<>|]', "", title)
    sanitized = sanitized.replace(" ", "_")

    # Long titles would otherwise produce filenames that exceed OS
    # limits; truncate and append an 8-char hash for uniqueness.
    if len(sanitized) > max_length:
        hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:8]
        sanitized = sanitized[:max_length] + "_" + hash_suffix

    return sanitized
|
14 |
|
15 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
16 |
print(f"π Starting PDF Processing: {os.path.basename(pdf_path)}")
|
@@ -131,23 +127,45 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
131 |
|
132 |
def upload_with_progress(file_path, repo_id, token, progress):
|
133 |
"""
|
134 |
-
Upload file to Hugging Face Dataset
|
135 |
"""
|
|
|
136 |
print(f"π€ Starting upload of Parquet: {file_path}")
|
137 |
file_size = os.path.getsize(file_path)
|
138 |
|
139 |
api = HfApi()
|
140 |
|
141 |
try:
|
142 |
-
#
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
if progress is not None:
|
152 |
progress(1, desc="β
Upload Complete")
|
153 |
|
@@ -161,9 +179,27 @@ def upload_with_progress(file_path, repo_id, token, progress):
|
|
161 |
print(f"β Unexpected error: {e}")
|
162 |
return f"β Unexpected error: {str(e)}"
|
163 |
|
|
|
164 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
165 |
upload_message = ""
|
166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
total_files = len(pdf_files)
|
168 |
print("π Starting PDF to Parquet Conversion Process")
|
169 |
|
@@ -174,7 +210,7 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
|
|
174 |
# β
Step 1: Process PDF with Full Labels
|
175 |
extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
|
176 |
|
177 |
-
# β
Step 2: Use Title for Parquet Filename
|
178 |
sanitized_title = sanitize_filename(extracted_data["title"])
|
179 |
parquet_file = f"{sanitized_title}.parquet"
|
180 |
|
@@ -199,7 +235,6 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
|
|
199 |
print("π Process Completed")
|
200 |
return parquet_file, upload_message
|
201 |
|
202 |
-
|
203 |
# β
Gradio Interface
|
204 |
iface = gr.Interface(
|
205 |
fn=pdf_to_parquet_and_upload,
|
|
|
6 |
from huggingface_hub import HfApi
|
7 |
from huggingface_hub.utils import HfHubHTTPError
|
8 |
import time
|
9 |
+
import hashlib
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
12 |
print(f"π Starting PDF Processing: {os.path.basename(pdf_path)}")
|
|
|
127 |
|
128 |
def upload_with_progress(file_path, repo_id, token, progress):
|
129 |
"""
|
130 |
+
Upload file to Hugging Face Dataset with progress tracking.
|
131 |
"""
|
132 |
+
import requests # Ensure this is imported if not already
|
133 |
print(f"π€ Starting upload of Parquet: {file_path}")
|
134 |
file_size = os.path.getsize(file_path)
|
135 |
|
136 |
api = HfApi()
|
137 |
|
138 |
try:
|
139 |
+
# Open the file in binary read mode
|
140 |
+
with open(file_path, 'rb') as f:
|
141 |
+
chunk_size = 1024 * 1024 # 1 MB chunks
|
142 |
+
uploaded = 0
|
143 |
+
|
144 |
+
# Prepare headers
|
145 |
+
headers = {
|
146 |
+
"Authorization": f"Bearer {token}"
|
147 |
+
}
|
148 |
+
|
149 |
+
# Construct upload URL
|
150 |
+
upload_url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
|
151 |
+
|
152 |
+
while True:
|
153 |
+
chunk = f.read(chunk_size)
|
154 |
+
if not chunk:
|
155 |
+
break # Finished reading file
|
156 |
+
|
157 |
+
# Upload chunk
|
158 |
+
response = requests.put(upload_url, headers=headers, data=chunk)
|
159 |
+
|
160 |
+
if response.status_code != 200:
|
161 |
+
raise Exception(f"Upload failed: {response.text}")
|
162 |
|
163 |
+
# Update progress
|
164 |
+
uploaded += len(chunk)
|
165 |
+
if progress is not None:
|
166 |
+
progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
|
167 |
+
|
168 |
+
# Final progress update
|
169 |
if progress is not None:
|
170 |
progress(1, desc="β
Upload Complete")
|
171 |
|
|
|
179 |
print(f"β Unexpected error: {e}")
|
180 |
return f"β Unexpected error: {str(e)}"
|
181 |
|
182 |
+
|
183 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
184 |
upload_message = ""
|
185 |
|
186 |
+
# β
Helper function inside this block to avoid external edits
|
187 |
+
def sanitize_filename(title, max_length=100):
    """Return *title* rewritten as a filesystem-safe name, capped in length.

    Illegal filename characters are stripped, spaces become underscores,
    and names longer than *max_length* are truncated with a short md5
    suffix appended so truncated names remain distinct.
    """
    # Drop characters that are invalid in filenames, then use underscores.
    cleaned = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")

    if len(cleaned) <= max_length:
        return cleaned

    # Truncation could collide for titles sharing a long prefix; an
    # 8-character content hash keeps the result unique.
    digest = hashlib.md5(cleaned.encode()).hexdigest()[:8]
    return cleaned[:max_length] + "_" + digest
|
202 |
+
|
203 |
total_files = len(pdf_files)
|
204 |
print("π Starting PDF to Parquet Conversion Process")
|
205 |
|
|
|
210 |
# β
Step 1: Process PDF with Full Labels
|
211 |
extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
|
212 |
|
213 |
+
# β
Step 2: Use Title for Parquet Filename with Truncation & Hash
|
214 |
sanitized_title = sanitize_filename(extracted_data["title"])
|
215 |
parquet_file = f"{sanitized_title}.parquet"
|
216 |
|
|
|
235 |
print("π Process Completed")
|
236 |
return parquet_file, upload_message
|
237 |
|
|
|
238 |
# β
Gradio Interface
|
239 |
iface = gr.Interface(
|
240 |
fn=pdf_to_parquet_and_upload,
|