Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Files Community

Jobey1 committed on Feb 25

Commit

7998543

verified ·

1 Parent(s): fb7ac68

Update app.py

Browse files

Added a progress bar for the upload. Trying to fix an issue with outdated code that exports the file to a dataset.

Files changed (1) hide show

app.py +52 -32

app.py CHANGED Viewed

@@ -2,15 +2,19 @@ import gradio as gr
 import pandas as pd
 import fitz  # PyMuPDF
 import os
-from huggingface_hub import HfApi
-from huggingface_hub.utils import HfHubHTTPError
-def extract_paragraphs_with_headers(pdf_path):
     doc = fitz.open(pdf_path)
     data = []
     for page_num, page in enumerate(doc):
         blocks = page.get_text("dict")["blocks"]
         for block in blocks:
             if "lines" in block:
@@ -18,7 +22,7 @@ def extract_paragraphs_with_headers(pdf_path):
                 for line in block["lines"]:
                     for span in line["spans"]:
                         text += span["text"] + " "
                 text = text.strip()
                 # Detect headers based on font size
@@ -32,12 +36,47 @@ def extract_paragraphs_with_headers(pdf_path):
     return data
-def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
     all_data = []
     for pdf_file in pdf_files:
-        extracted_data = extract_paragraphs_with_headers(pdf_file.name)
         for item in extracted_data:
             all_data.append({
                 'filename': os.path.basename(pdf_file.name),
@@ -58,31 +97,11 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
     # Only upload if the user selects it
     if action_choice in ["Upload to Hugging Face", "Both"]:
         try:
-            api = HfApi()
-            api.set_access_token(hf_token)
-            # Validate the user's repo
-            try:
-                api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
-                repo_exists = True
-            except HfHubHTTPError:
-                repo_exists = False
-            if repo_exists:
-                api.upload_file(
-                    path_or_fileobj=parquet_file,
-                    path_in_repo='papers_with_headers.parquet',
-                    repo_id=dataset_repo_id,
-                    repo_type='dataset'
-                )
-                upload_message = f"✅ Successfully uploaded to {dataset_repo_id}"
-            else:
-                upload_message = "❌ Dataset repo not found. Please check the repo ID."
         except Exception as e:
             upload_message = f"❌ Upload failed: {str(e)}"
-    # Return the file for local download + upload status
     return parquet_file, upload_message
 # Gradio Interface
@@ -98,8 +117,9 @@ iface = gr.Interface(
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
-    title="PDF to Parquet Converter with User-Controlled Upload",
-    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo."
 )
 iface.launch()

 import pandas as pd
 import fitz  # PyMuPDF
 import os
+from huggingface_hub import HfApi, HfHubHTTPError
+import requests
+import time
+def extract_paragraphs_with_headers(pdf_path, progress=None):
     doc = fitz.open(pdf_path)
     data = []
+    total_pages = len(doc)
     for page_num, page in enumerate(doc):
+        if progress:
+            progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")
         blocks = page.get_text("dict")["blocks"]
         for block in blocks:
             if "lines" in block:
                 for line in block["lines"]:
                     for span in line["spans"]:
                         text += span["text"] + " "
                 text = text.strip()
                 # Detect headers based on font size
     return data
+def upload_with_progress(file_path, repo_id, token, progress):
+    """
+    Upload file to Hugging Face Dataset with progress tracking.
+    """
+    file_size = os.path.getsize(file_path)
+    url = f"https://huggingface.co/api/datasets/{repo_id}/upload"
+    headers = {
+        "Authorization": f"Bearer {token}"
+    }
+    with open(file_path, 'rb') as f:
+        chunk_size = 1024 * 1024  # 1MB
+        uploaded = 0
+        while True:
+            chunk = f.read(chunk_size)
+            if not chunk:
+                break
+            response = requests.put(
+                url,
+                headers=headers,
+                data=chunk
+            )
+            uploaded += len(chunk)
+            progress(uploaded / file_size, desc=f"Uploading... {uploaded // (1024 * 1024)}MB/{file_size // (1024 * 1024)}MB")
+            time.sleep(0.1)  # Simulate delay for progress update
+            if response.status_code != 200:
+                raise Exception(f"Upload failed: {response.text}")
+    return f"✅ Successfully uploaded to {repo_id}"
+def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
     all_data = []
+    # Process each uploaded PDF
     for pdf_file in pdf_files:
+        extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
         for item in extracted_data:
             all_data.append({
                 'filename': os.path.basename(pdf_file.name),
     # Only upload if the user selects it
     if action_choice in ["Upload to Hugging Face", "Both"]:
         try:
+            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
         except Exception as e:
             upload_message = f"❌ Upload failed: {str(e)}"
+    # Return Parquet file and status message
     return parquet_file, upload_message
 # Gradio Interface
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
+    title="PDF to Parquet Converter with Upload Progress",
+    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo with real-time progress tracking."
 )
 iface.launch()