Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Community

Jobey1 committed on Feb 25

Commit

dfa54c4

verified ·

1 Parent(s): 06449e7

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -36

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import fitz  # PyMuPDF
 import os
 from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
-import requests
 import time
 def extract_paragraphs_with_headers(pdf_path, progress=None):
@@ -48,47 +47,35 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
 def upload_with_progress(file_path, repo_id, token, progress):
     """
-    Upload file to Hugging Face Dataset with progress tracking.
     """
     print(f"📤 Starting upload of Parquet: {file_path}")
     file_size = os.path.getsize(file_path)
-    url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
-    headers = {
-        "Authorization": f"Bearer {token}"
-    }
-    with open(file_path, 'rb') as f:
-        chunk_size = 1024 * 1024  # 1MB
-        uploaded = 0
-        max_chunks = file_size // chunk_size + 10  # Safety limit to avoid infinite loops
-        chunk_count = 0
-        while True:
-            chunk = f.read(chunk_size)
-            if not chunk:
-                break
-            response = requests.put(
-                url,
-                headers=headers,
-                data=chunk
-            )
-            uploaded += len(chunk)
-            if progress is not None:
-                progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
-            time.sleep(0.1)  # Smooth progress update
-            chunk_count += 1
-            if chunk_count > max_chunks:
-                raise Exception("⚠️ Upload exceeded expected chunk limit. Aborting.")
-            if response.status_code != 200:
-                raise Exception(f"❌ Upload failed: {response.text}")
-    print(f"✅ Successfully uploaded to {repo_id}")
-    return f"✅ Successfully uploaded to {repo_id}"
 def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
     all_data = []
@@ -148,8 +135,8 @@ iface = gr.Interface(
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
-    title="PDF to Parquet Converter with Detailed Progress",
-    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
 )
 iface.launch()

 import os
 from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
 import time
 def extract_paragraphs_with_headers(pdf_path, progress=None):
 def upload_with_progress(file_path, repo_id, token, progress):
     """
+    Upload file to Hugging Face Dataset using upload_file() API method.
     """
     print(f"📤 Starting upload of Parquet: {file_path}")
     file_size = os.path.getsize(file_path)
+    api = HfApi()
+    try:
+        # Use upload_file() method from huggingface_hub
+        api.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=os.path.basename(file_path),
+            repo_id=repo_id,
+            repo_type="dataset",
+            token=token
+        )
+        if progress is not None:
+            progress(1, desc="✅ Upload Complete")
+        print(f"✅ Successfully uploaded to {repo_id}")
+        return f"✅ Successfully uploaded to {repo_id}"
+    except HfHubHTTPError as e:
+        print(f"❌ Upload failed: {e}")
+        return f"❌ Upload failed: {str(e)}"
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        return f"❌ Unexpected error: {str(e)}"
 def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
     all_data = []
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
+    title="PDF to Parquet Converter with Correct Upload API",
+    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset using the official API."
 )
 iface.launch()