Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Running

App Files Files Community

Jobey1 committed on Feb 26

Commit

9bea774

verified ·

1 Parent(s): b891f10

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -48

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from huggingface_hub.utils import HfHubHTTPError
 import time
 def extract_full_paper_with_labels(pdf_path, progress=None):
-    print(f"📝 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
     content = ""
@@ -28,7 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
     # Regex patterns for detection
     doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
     year_pattern = r'\b(19|20)\d{2}\b'
-    code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|}|;)"
     reference_keywords = ['reference', 'bibliography', 'sources']
     financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
@@ -122,65 +122,131 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
         "content": content
     }
-def process_pdf_file(pdf_file, api_key, repo_address):
-    if pdf_file is None:
-        return None, "No PDF file uploaded."
-    # Determine file path (Gradio returns a file object or dict)
-    file_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file['name']
-    result = extract_full_paper_with_labels(file_path)
-    # Convert the result dictionary to a DataFrame and write it as a parquet file.
-    df = pd.DataFrame([result])
-    base = os.path.splitext(result['filename'])[0]
-    parquet_filename = f"{base}.parquet"
-    df.to_parquet(parquet_filename, index=False)
-    repo_status = ""
-    if api_key and repo_address:
-        api = HfApi()
         try:
-            api.upload_file(
-                path_or_fileobj=parquet_filename,
-                path_in_repo=parquet_filename,
-                repo_id=repo_address,
-                token=api_key
-            )
-            repo_status = f"File uploaded to repo {repo_address} successfully."
         except Exception as e:
-            repo_status = f"Failed to upload to repo: {str(e)}"
-    else:
-        repo_status = "API key or repo address not provided, skipping repo upload."
-    return parquet_filename, repo_status
-# Clear only file-related inputs/outputs, preserving API key and repo address.
-def clear_files():
-    return None, None, ""
-# Gradio interface setup
 with gr.Blocks() as demo:
     with gr.Row():
-        api_key_input = gr.Textbox(label="API Key", placeholder="Enter API Key")
-        repo_address_input = gr.Textbox(label="Repo Address", placeholder="Enter Repo Address")
     with gr.Row():
-        pdf_file_input = gr.File(label="Upload PDF")
-        convert_button = gr.Button("Convert to Parquet")
-        clear_button = gr.Button("Clear Files")
     with gr.Row():
-        download_file_output = gr.File(label="Download Parquet File")
-        repo_status_output = gr.Textbox(label="Repo Upload Status")
     convert_button.click(
-        process_pdf_file,
-        inputs=[pdf_file_input, api_key_input, repo_address_input],
-        outputs=[download_file_output, repo_status_output]
     )
-    clear_button.click(
-        clear_files,
         inputs=None,
-        outputs=[pdf_file_input, download_file_output, repo_status_output]
     )
 demo.launch()

 import time
 def extract_full_paper_with_labels(pdf_path, progress=None):
+    print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
     content = ""
     # Regex patterns for detection
     doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
     year_pattern = r'\b(19|20)\d{2}\b'
+    code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
     reference_keywords = ['reference', 'bibliography', 'sources']
     financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
         "content": content
     }
+def upload_with_progress(file_path, repo_id, token, progress):
+    """
+    Upload file to Hugging Face Dataset using upload_file() API method.
+    """
+    print(f"📤 Starting upload of Parquet: {file_path}")
+    file_size = os.path.getsize(file_path)
+    api = HfApi()
+    try:
+        # Use upload_file() method from huggingface_hub
+        api.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=os.path.basename(file_path),
+            repo_id=repo_id,
+            repo_type="dataset",
+            token=token
+        )
+        if progress is not None:
+            progress(1, desc="✅ Upload Complete")
+        print(f"✅ Successfully uploaded to {repo_id}")
+        return f"✅ Successfully uploaded to {repo_id}"
+    except HfHubHTTPError as e:
+        print(f"❌ Upload failed: {e}")
+        return f"❌ Upload failed: {str(e)}"
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        return f"❌ Unexpected error: {str(e)}"
+def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
+    all_data = []
+    total_files = len(pdf_files)
+    print("🚀 Starting PDF to Parquet Conversion Process")
+    for idx, pdf_file in enumerate(pdf_files):
+        if progress is not None:
+            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
+        # ✅ Step 1: Process PDF with Full Labels
+        extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
+        all_data.append(extracted_data)
+    print("🟡 Converting Processed Data to Parquet")
+    # ✅ Step 2: Convert to Parquet
+    df = pd.DataFrame(all_data)
+    parquet_file = 'fully_labeled_papers.parquet'
+    try:
+        df.to_parquet(parquet_file, engine='pyarrow', index=False)
+        print("✅ Parquet Conversion Completed")
+    except Exception as e:
+        print(f"❌ Parquet Conversion Failed: {str(e)}")
+        return None, f"❌ Parquet Conversion Failed: {str(e)}"
+    upload_message = "Skipped Upload"
+    # ✅ Step 3: Upload Parquet (if selected)
+    if action_choice in ["Upload to Hugging Face", "Both"]:
         try:
+            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
         except Exception as e:
+            print(f"❌ Upload Failed: {str(e)}")
+            upload_message = f"❌ Upload failed: {str(e)}"
+    print("🏁 Process Completed")
+    return parquet_file, upload_message
+# Define a function for our custom "Reset Files Only" button.
+def reset_files_fn():
+    # Return None for both the file input and the output file, clearing them.
+    return None, None
 with gr.Blocks() as demo:  # Build the Gradio UI and wire the button callbacks.
+    gr.Markdown(  # Intro text explaining the two reset buttons.
+        """
+        # PDF to Parquet Converter with Full Labeling
+        **Clear All Inputs:** The button below (labeled "Clear All Inputs") will reset every field, including your API key and dataset repo ID.
+        **Reset Files Only:** Use this button if you want to clear the PDF file uploads and the generated Parquet file, while keeping your credentials intact.
+        """
+    )
+    with gr.Row():  # PDF upload; multiple .pdf files are accepted.
+        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)")
     with gr.Row():  # Credentials: API token (password-type field) and target dataset repo ID.
+        hf_token = gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token")
+        dataset_repo = gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset")
     with gr.Row():  # Choice of what to do with the generated Parquet file.
+        action_radio = gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally")
     with gr.Row():  # Action buttons.
+        convert_button = gr.Button("Convert PDF to Parquet")
+        reset_files_button = gr.Button("Reset Files Only")
+        clear_all_button = gr.Button("Clear All Inputs")
+    with gr.Row():  # Outputs: the downloadable Parquet file and a status message.
+        output_file = gr.File(label="Download Parquet File")
+        status_text = gr.Textbox(label="Status")
     convert_button.click(  # Run the conversion (and optional upload) on click.
+        fn=pdf_to_parquet_and_upload,
+        inputs=[pdf_input, hf_token, dataset_repo, action_radio],
+        outputs=[output_file, status_text]
     )
+    reset_files_button.click(  # Clears only the PDF input and the output file; credentials stay.
+        fn=reset_files_fn,
         inputs=None,
+        outputs=[pdf_input, output_file]
+    )
+    # The Clear All button resets every input field (output_file and status_text are not in its outputs).
+    def clear_all_fn():
+        return None, None, None, "Download Locally"  # pdf_input, hf_token, dataset_repo, action_radio (back to its initial value)
+    clear_all_button.click(
+        fn=clear_all_fn,
+        inputs=None,
+        outputs=[pdf_input, hf_token, dataset_repo, action_radio]
     )
 demo.launch()  # Launch the app once the layout and callbacks are defined.