# Spaces: Jobey1 / Convert-PDF-To-Parquet-With-paragraph-markers
# Space status: Sleeping
# Convert-PDF-To-Parquet-With-paragraph-markers
# File size: 3,604 Bytes

import gradio as gr
import pandas as pd
import fitz  # PyMuPDF
import os
from huggingface_hub import HfApi, HfHubHTTPError

def extract_paragraphs_with_headers(pdf_path, header_font_size=15):
    """Extract text blocks from a PDF and flag the ones that look like headers.

    Each text block reported by PyMuPDF becomes one record. A block is flagged
    as a header when at least one of its spans uses a font size strictly
    greater than ``header_font_size``.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.
        header_font_size: Font-size threshold; spans larger than this mark
            the whole block as a header. Defaults to 15.

    Returns:
        A list of dicts, in document order, each with the keys
        ``"page_num"`` (1-based page number), ``"text"`` (span texts joined
        by single spaces, stripped) and ``"is_header"`` (bool).
    """
    records = []

    # The context manager closes the document even if extraction raises,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" key carry no text and are skipped.
                if "lines" not in block:
                    continue

                spans = [
                    span
                    for line in block["lines"]
                    for span in line["spans"]
                ]
                text = " ".join(span["text"] for span in spans).strip()

                # Detect headers based on font size
                is_header = any(
                    span["size"] > header_font_size for span in spans
                )

                records.append({
                    "page_num": page_num,
                    "text": text,
                    "is_header": is_header,
                })

    return records

def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
    """Convert uploaded PDFs to one Parquet file and optionally upload it.

    Every PDF is passed through ``extract_paragraphs_with_headers``; the
    resulting records are tagged with the source file's base name, collected
    into a DataFrame and written to ``papers_with_headers.parquet`` in the
    current working directory.

    Args:
        pdf_files: Iterable of uploaded files. Each item may be an object
            exposing its path as ``.name`` or a plain path string. ``None``
            (nothing uploaded) is treated as an empty list.
        hf_token: Hugging Face access token used for the upload.
        dataset_repo_id: Target dataset repo, e.g. ``"username/dataset"``.
        action_choice: Upload happens only for ``"Upload to Hugging Face"``
            or ``"Both"``; any other value just writes the local file.

    Returns:
        A ``(parquet_path, status_message)`` tuple. ``status_message`` is an
        empty string when no upload was requested.
    """
    rows = []

    for pdf_file in pdf_files or []:
        # Accept both file objects exposing `.name` and plain path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        filename = os.path.basename(pdf_path)

        for item in extract_paragraphs_with_headers(pdf_path):
            rows.append({
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header']
            })

    # Explicit columns keep the schema stable even when nothing was extracted.
    df = pd.DataFrame(rows, columns=['filename', 'page_num', 'text', 'is_header'])

    # Save as Parquet
    parquet_file = 'papers_with_headers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    upload_message = ""

    # Only upload if the user selects it
    if action_choice in ("Upload to Hugging Face", "Both"):
        # Broad catch is deliberate: this is the UI boundary, and any failure
        # should surface in the status box instead of crashing the request.
        try:
            # The token goes to the constructor; HfApi.set_access_token is
            # no longer available in current huggingface_hub releases.
            api = HfApi(token=hf_token or None)

            # Validate the user's repo
            try:
                api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
            except HfHubHTTPError:
                upload_message = "❌ Dataset repo not found. Please check the repo ID."
            else:
                api.upload_file(
                    path_or_fileobj=parquet_file,
                    path_in_repo='papers_with_headers.parquet',
                    repo_id=dataset_repo_id,
                    repo_type='dataset'
                )
                upload_message = f"✅ Successfully uploaded to {dataset_repo_id}"

        except Exception as e:
            upload_message = f"❌ Upload failed: {str(e)}"

    # Return the file for local download + upload status
    return parquet_file, upload_message

# Gradio Interface
# Input widgets, listed in the positional order of the parameters of
# pdf_to_parquet_and_upload: files, token, repo id, action.
_input_widgets = [
    gr.File(
        file_types=[".pdf"],
        file_count="multiple",
        label="Upload PDFs (Drag & Drop or Search)",
    ),
    gr.Textbox(
        label="Hugging Face API Token",
        type="password",
        placeholder="Enter your Hugging Face API token",
    ),
    gr.Textbox(
        label="Your Dataset Repo ID (e.g., username/research-dataset)",
        placeholder="username/research-dataset",
    ),
    gr.Radio(
        ["Download Locally", "Upload to Hugging Face", "Both"],
        label="Action",
        value="Download Locally",
    ),
]

# Output widgets, matching the (parquet_file, upload_message) return tuple.
_output_widgets = [
    gr.File(label="Download Parquet File"),
    gr.Textbox(label="Status"),
]

iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="PDF to Parquet Converter with User-Controlled Upload",
    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo.",
)

# Start the web app as soon as the module is executed.
iface.launch()