Update app.py
Browse files
app.py
CHANGED
@@ -6,11 +6,7 @@ import re
|
|
6 |
from huggingface_hub import HfApi
|
7 |
from huggingface_hub.utils import HfHubHTTPError
|
8 |
import time
|
9 |
-
|
10 |
-
def sanitize_filename(title, max_length=100):
    """Turn a paper title into a safe filename.

    Removes characters that are invalid on common filesystems, replaces
    spaces with underscores, and truncates overly long results so the
    name stays within OS filename limits (most filesystems cap entries
    at ~255 bytes). A short md5 suffix keeps truncated names unique.

    Args:
        title: The raw title string to sanitize.
        max_length: Maximum length of the sanitized stem before a hash
            suffix is appended (default 100, backward-compatible for
            short titles).

    Returns:
        A filesystem-safe string suitable for use as a filename stem.
    """
    import hashlib  # local import: only needed for the truncation path

    # Remove invalid characters and replace spaces with underscores
    sanitized = re.sub(r'[\\/*?:"<>|]', "", title)
    sanitized = sanitized.replace(" ", "_")

    # Long titles would otherwise produce filenames that exceed OS
    # limits; truncate and append an 8-char hash for uniqueness.
    if len(sanitized) > max_length:
        hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:8]
        sanitized = sanitized[:max_length] + "_" + hash_suffix

    return sanitized
|
14 |
|
15 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
16 |
print(f"π Starting PDF Processing: {os.path.basename(pdf_path)}")
|
@@ -131,23 +127,45 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
131 |
|
132 |
def upload_with_progress(file_path, repo_id, token, progress):
|
133 |
"""
|
134 |
-
Upload file to Hugging Face Dataset
|
135 |
"""
|
|
|
136 |
print(f"π€ Starting upload of Parquet: {file_path}")
|
137 |
file_size = os.path.getsize(file_path)
|
138 |
|
139 |
api = HfApi()
|
140 |
|
141 |
try:
|
142 |
-
#
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
if progress is not None:
|
152 |
progress(1, desc="β
Upload Complete")
|
153 |
|
@@ -161,9 +179,27 @@ def upload_with_progress(file_path, repo_id, token, progress):
|
|
161 |
print(f"β Unexpected error: {e}")
|
162 |
return f"β Unexpected error: {str(e)}"
|
163 |
|
|
|
164 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
165 |
upload_message = ""
|
166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
total_files = len(pdf_files)
|
168 |
print("π Starting PDF to Parquet Conversion Process")
|
169 |
|
@@ -174,7 +210,7 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
|
|
174 |
# β
Step 1: Process PDF with Full Labels
|
175 |
extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
|
176 |
|
177 |
-
# β
Step 2: Use Title for Parquet Filename
|
178 |
sanitized_title = sanitize_filename(extracted_data["title"])
|
179 |
parquet_file = f"{sanitized_title}.parquet"
|
180 |
|
@@ -199,7 +235,6 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
|
|
199 |
print("π Process Completed")
|
200 |
return parquet_file, upload_message
|
201 |
|
202 |
-
|
203 |
# β
Gradio Interface
|
204 |
iface = gr.Interface(
|
205 |
fn=pdf_to_parquet_and_upload,
|
|
|
6 |
from huggingface_hub import HfApi
|
7 |
from huggingface_hub.utils import HfHubHTTPError
|
8 |
import time
|
9 |
+
import hashlib
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def extract_full_paper_with_labels(pdf_path, progress=None):
|
12 |
print(f"π Starting PDF Processing: {os.path.basename(pdf_path)}")
|
|
|
127 |
|
128 |
def upload_with_progress(file_path, repo_id, token, progress):
|
129 |
"""
|
130 |
+
Upload file to Hugging Face Dataset with progress tracking.
|
131 |
"""
|
132 |
+
import requests # Ensure this is imported if not already
|
133 |
print(f"π€ Starting upload of Parquet: {file_path}")
|
134 |
file_size = os.path.getsize(file_path)
|
135 |
|
136 |
api = HfApi()
|
137 |
|
138 |
try:
|
139 |
+
# Open the file in binary read mode
|
140 |
+
with open(file_path, 'rb') as f:
|
141 |
+
chunk_size = 1024 * 1024 # 1 MB chunks
|
142 |
+
uploaded = 0
|
143 |
+
|
144 |
+
# Prepare headers
|
145 |
+
headers = {
|
146 |
+
"Authorization": f"Bearer {token}"
|
147 |
+
}
|
148 |
+
|
149 |
+
# Construct upload URL
|
150 |
+
upload_url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
|
151 |
+
|
152 |
+
while True:
|
153 |
+
chunk = f.read(chunk_size)
|
154 |
+
if not chunk:
|
155 |
+
break # Finished reading file
|
156 |
+
|
157 |
+
# Upload chunk
|
158 |
+
response = requests.put(upload_url, headers=headers, data=chunk)
|
159 |
+
|
160 |
+
if response.status_code != 200:
|
161 |
+
raise Exception(f"Upload failed: {response.text}")
|
162 |
|
163 |
+
# Update progress
|
164 |
+
uploaded += len(chunk)
|
165 |
+
if progress is not None:
|
166 |
+
progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
|
167 |
+
|
168 |
+
# Final progress update
|
169 |
if progress is not None:
|
170 |
progress(1, desc="β
Upload Complete")
|
171 |
|
|
|
179 |
print(f"β Unexpected error: {e}")
|
180 |
return f"β Unexpected error: {str(e)}"
|
181 |
|
182 |
+
|
183 |
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
|
184 |
upload_message = ""
|
185 |
|
186 |
+
# β
Helper function inside this block to avoid external edits
|
187 |
+
def sanitize_filename(title, max_length=100):
    """Return *title* rewritten as a filesystem-safe name, capped in length.

    Illegal filename characters are stripped, spaces become underscores,
    and names longer than *max_length* are truncated with a short md5
    suffix appended so truncated names remain distinct.
    """
    # Drop characters that are invalid in filenames, then use underscores.
    cleaned = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")

    if len(cleaned) <= max_length:
        return cleaned

    # Truncation could collide for titles sharing a long prefix; an
    # 8-character content hash keeps the result unique.
    digest = hashlib.md5(cleaned.encode()).hexdigest()[:8]
    return cleaned[:max_length] + "_" + digest
|
202 |
+
|
203 |
total_files = len(pdf_files)
|
204 |
print("π Starting PDF to Parquet Conversion Process")
|
205 |
|
|
|
210 |
# β
Step 1: Process PDF with Full Labels
|
211 |
extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
|
212 |
|
213 |
+
# β
Step 2: Use Title for Parquet Filename with Truncation & Hash
|
214 |
sanitized_title = sanitize_filename(extracted_data["title"])
|
215 |
parquet_file = f"{sanitized_title}.parquet"
|
216 |
|
|
|
235 |
print("π Process Completed")
|
236 |
return parquet_file, upload_message
|
237 |
|
|
|
238 |
# β
Gradio Interface
|
239 |
iface = gr.Interface(
|
240 |
fn=pdf_to_parquet_and_upload,
|