Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Running

App Files Community

Jobey1 committed on Feb 25

Commit

06449e7

verified ·

1 Parent(s): ad0b1f7

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -17

app.py CHANGED Viewed

@@ -8,12 +8,20 @@ import requests
 import time
 def extract_paragraphs_with_headers(pdf_path, progress=None):
     doc = fitz.open(pdf_path)
     data = []
     total_pages = len(doc)
     for page_num, page in enumerate(doc):
-        if progress:
             progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
         blocks = page.get_text("dict")["blocks"]
@@ -35,12 +43,14 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
                     "is_header": is_header
                 })
     return data
 def upload_with_progress(file_path, repo_id, token, progress):
     """
     Upload file to Hugging Face Dataset with progress tracking.
     """
     file_size = os.path.getsize(file_path)
     url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
@@ -51,6 +61,8 @@ def upload_with_progress(file_path, repo_id, token, progress):
     with open(file_path, 'rb') as f:
         chunk_size = 1024 * 1024  # 1MB
         uploaded = 0
         while True:
             chunk = f.read(chunk_size)
@@ -64,19 +76,31 @@ def upload_with_progress(file_path, repo_id, token, progress):
             )
             uploaded += len(chunk)
-            progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
-            time.sleep(0.1)  # Simulate delay for progress update
             if response.status_code != 200:
-                raise Exception(f"Upload failed: {response.text}")
     return f"✅ Successfully uploaded to {repo_id}"
 def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
     all_data = []
-    # Process each uploaded PDF
-    for pdf_file in pdf_files:
         extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
         for item in extracted_data:
             all_data.append({
@@ -86,26 +110,32 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
                 'is_header': item['is_header']
             })
-    # Convert to DataFrame
     df = pd.DataFrame(all_data)
-    # Save as Parquet
     parquet_file = 'papers_with_headers.parquet'
-    df.to_parquet(parquet_file, engine='pyarrow', index=False)
-    upload_message = ""
-    # Only upload if the user selects it
     if action_choice in ["Upload to Hugging Face", "Both"]:
         try:
             upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
         except Exception as e:
             upload_message = f"❌ Upload failed: {str(e)}"
-    # Return Parquet file and status message
     return parquet_file, upload_message
-# Gradio Interface
 iface = gr.Interface(
     fn=pdf_to_parquet_and_upload,
     inputs=[
@@ -118,9 +148,8 @@ iface = gr.Interface(
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
-    title="PDF to Parquet Converter with Upload Progress",
-    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo with real-time progress tracking."
 )
 iface.launch()

 import time
 def extract_paragraphs_with_headers(pdf_path, progress=None):
+    print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
     data = []
     total_pages = len(doc)
+    max_iterations = total_pages * 2  # To prevent infinite loops
+    iteration_count = 0
     for page_num, page in enumerate(doc):
+        iteration_count += 1
+        if iteration_count > max_iterations:
+            raise Exception("⚠️ PDF processing exceeded iteration limit. Possible malformed PDF.")
+        if progress is not None:
             progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
         blocks = page.get_text("dict")["blocks"]
                     "is_header": is_header
                 })
+    print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
     return data
 def upload_with_progress(file_path, repo_id, token, progress):
     """
     Upload file to Hugging Face Dataset with progress tracking.
     """
+    print(f"📤 Starting upload of Parquet: {file_path}")
     file_size = os.path.getsize(file_path)
     url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
     with open(file_path, 'rb') as f:
         chunk_size = 1024 * 1024  # 1MB
         uploaded = 0
+        max_chunks = file_size // chunk_size + 10  # Safety limit to avoid infinite loops
+        chunk_count = 0
         while True:
             chunk = f.read(chunk_size)
             )
             uploaded += len(chunk)
+            if progress is not None:
+                progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
+            time.sleep(0.1)  # Smooth progress update
+            chunk_count += 1
+            if chunk_count > max_chunks:
+                raise Exception("⚠️ Upload exceeded expected chunk limit. Aborting.")
             if response.status_code != 200:
+                raise Exception(f"❌ Upload failed: {response.text}")
+    print(f"✅ Successfully uploaded to {repo_id}")
     return f"✅ Successfully uploaded to {repo_id}"
 def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
     all_data = []
+    total_files = len(pdf_files)
+    print("🚀 Starting PDF to Parquet Conversion Process")
+    for idx, pdf_file in enumerate(pdf_files):
+        if progress is not None:
+            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
+        # ✅ Step 1: Process PDF
         extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
         for item in extracted_data:
             all_data.append({
                 'is_header': item['is_header']
             })
+    print("🟡 Converting Processed Data to Parquet")
+    # ✅ Step 2: Convert to Parquet
     df = pd.DataFrame(all_data)
     parquet_file = 'papers_with_headers.parquet'
+    try:
+        df.to_parquet(parquet_file, engine='pyarrow', index=False)
+        print("✅ Parquet Conversion Completed")
+    except Exception as e:
+        print(f"❌ Parquet Conversion Failed: {str(e)}")
+        return None, f"❌ Parquet Conversion Failed: {str(e)}"
+    upload_message = "Skipped Upload"
+    # ✅ Step 3: Upload Parquet (if selected)
     if action_choice in ["Upload to Hugging Face", "Both"]:
         try:
             upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
         except Exception as e:
+            print(f"❌ Upload Failed: {str(e)}")
             upload_message = f"❌ Upload failed: {str(e)}"
+    print("🏁 Process Completed")
     return parquet_file, upload_message
+# ✅ Gradio Interface
 iface = gr.Interface(
     fn=pdf_to_parquet_and_upload,
     inputs=[
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
+    title="PDF to Parquet Converter with Detailed Progress",
+    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
 )
 iface.launch()