Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Files Community

Jobey1 committed on Feb 26

Commit

c8cd30b

verified ·

1 Parent(s): 9d4e756

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -73

app.py CHANGED Viewed

@@ -6,8 +6,6 @@ import re
 from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
 import time
-import hashlib
-import requests
 def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
@@ -100,7 +98,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
                 elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
                     content += f"<EQUATION>{text}</EQUATION>\n"
-                # Code Blocks (enhanced detection)
                 elif re.search(code_pattern, text) and len(text.split()) <= 50:
                     content += f"<CODE>{text}</CODE>\n"
@@ -121,49 +119,28 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
     return {
         "filename": os.path.basename(pdf_path),
-        "title": title if title else "Untitled_Paper",
         "content": content
     }
 def upload_with_progress(file_path, repo_id, token, progress):
     """
-    Upload file to Hugging Face Dataset with progress tracking.
     """
     print(f"📤 Starting upload of Parquet: {file_path}")
     file_size = os.path.getsize(file_path)
-    api = HfApi()
-    # Get the proper upload URL from the Hugging Face API
-    upload_url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
     try:
-        with open(file_path, 'rb') as f:
-            chunk_size = 1024 * 1024  # 1 MB chunks
-            uploaded = 0
-            headers = {
-                "Authorization": f"Bearer {token}",
-                "Content-Type": "application/octet-stream"
-            }
-            while True:
-                chunk = f.read(chunk_size)
-                if not chunk:
-                    break  # Finished reading file
-                response = requests.put(upload_url, headers=headers, data=chunk)
-                if response.status_code != 200:
-                    raise Exception(f"Upload failed: {response.text}")
-                # Update progress after each chunk
-                uploaded += len(chunk)
-                if progress is not None:
-                    progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
-        # Final progress update
         if progress is not None:
             progress(1, desc="✅ Upload Complete")
@@ -177,27 +154,8 @@ def upload_with_progress(file_path, repo_id, token, progress):
         print(f"❌ Unexpected error: {e}")
         return f"❌ Unexpected error: {str(e)}"
 def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
-    upload_message = ""
-    # ✅ Helper function inside this block to avoid external edits
-    def sanitize_filename(title, max_length=100):
-        """
-        Sanitize and truncate the filename to avoid OS limits.
-        """
-        # Remove invalid characters
-        sanitized = re.sub(r'[\\/*?:"<>|]', "", title)
-        sanitized = sanitized.replace(" ", "_")
-        # Truncate to max_length if necessary
-        if len(sanitized) > max_length:
-            # Append an 8-character hash for uniqueness
-            hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:8]
-            sanitized = sanitized[:max_length] + "_" + hash_suffix
-        return sanitized
     total_files = len(pdf_files)
     print("🚀 Starting PDF to Parquet Conversion Process")
@@ -208,28 +166,29 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
         # ✅ Step 1: Process PDF with Full Labels
         extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
-        # ✅ Step 2: Use Title for Parquet Filename with Truncation & Hash
-        sanitized_title = sanitize_filename(extracted_data["title"])
-        parquet_file = f"{sanitized_title}.parquet"
-        # Convert to DataFrame
-        df = pd.DataFrame([extracted_data])
         try:
-            df.to_parquet(parquet_file, engine='pyarrow', index=False)
-            print(f"✅ Parquet saved as: {parquet_file}")
         except Exception as e:
-            print(f"❌ Parquet Conversion Failed: {str(e)}")
-            return None, f"❌ Parquet Conversion Failed: {str(e)}"
-        # ✅ Step 3: Upload Parquet (if selected)
-        if action_choice in ["Upload to Hugging Face", "Both"]:
-            try:
-                upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
-            except Exception as e:
-                print(f"❌ Upload Failed: {str(e)}")
-                upload_message = f"❌ Upload failed: {str(e)}"
     print("🏁 Process Completed")
     return parquet_file, upload_message
@@ -247,10 +206,11 @@ iface = gr.Interface(
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
-    title="PDF to Parquet Converter with Title-Based Naming",
-    description="Upload your PDFs, convert them to Parquet files named after the paper title, and upload to your Hugging Face Dataset."
 )
 iface.launch()

 from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
 import time
 def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
                 elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
                     content += f"<EQUATION>{text}</EQUATION>\n"
+                # ✅ Improved Code Block Detection
                 elif re.search(code_pattern, text) and len(text.split()) <= 50:
                     content += f"<CODE>{text}</CODE>\n"
     print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
     return {
         "filename": os.path.basename(pdf_path),
         "content": content
     }
 def upload_with_progress(file_path, repo_id, token, progress):
     """
+    Upload file to Hugging Face Dataset using upload_file() API method.
     """
     print(f"📤 Starting upload of Parquet: {file_path}")
     file_size = os.path.getsize(file_path)
+    api = HfApi()
     try:
+        # Use upload_file() method from huggingface_hub
+        api.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=os.path.basename(file_path),
+            repo_id=repo_id,
+            repo_type="dataset",
+            token=token
+        )
         if progress is not None:
             progress(1, desc="✅ Upload Complete")
         print(f"❌ Unexpected error: {e}")
         return f"❌ Unexpected error: {str(e)}"
 def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
+    all_data = []
     total_files = len(pdf_files)
     print("🚀 Starting PDF to Parquet Conversion Process")
         # ✅ Step 1: Process PDF with Full Labels
         extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
+        all_data.append(extracted_data)
+    print("🟡 Converting Processed Data to Parquet")
+    # ✅ Step 2: Convert to Parquet
+    df = pd.DataFrame(all_data)
+    parquet_file = 'fully_labeled_papers.parquet'
+    try:
+        df.to_parquet(parquet_file, engine='pyarrow', index=False)
+        print("✅ Parquet Conversion Completed")
+    except Exception as e:
+        print(f"❌ Parquet Conversion Failed: {str(e)}")
+        return None, f"❌ Parquet Conversion Failed: {str(e)}"
+    upload_message = "Skipped Upload"
+    # ✅ Step 3: Upload Parquet (if selected)
+    if action_choice in ["Upload to Hugging Face", "Both"]:
         try:
+            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
         except Exception as e:
+            print(f"❌ Upload Failed: {str(e)}")
+            upload_message = f"❌ Upload failed: {str(e)}"
     print("🏁 Process Completed")
     return parquet_file, upload_message
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
+    title="PDF to Parquet Converter with Full Labeling",
+    description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
 )
 iface.launch()