"""Gradio app: extract text blocks from PDFs, save them as Parquet, optionally upload to the Hugging Face Hub."""

import os

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi

try:
    from huggingface_hub import HfHubHTTPError
except ImportError:
    # Some huggingface_hub releases only expose this exception from the
    # ``utils`` subpackage, not from the package top level.
    from huggingface_hub.utils import HfHubHTTPError

# A block is flagged as a header when any of its spans is strictly larger
# than this font size.
HEADER_FONT_SIZE = 15

# Name of the generated file, both on local disk and inside the dataset repo.
PARQUET_FILENAME = 'papers_with_headers.parquet'

# Explicit column order so the Parquet schema is the same even when no text
# block was extracted.
OUTPUT_COLUMNS = ['filename', 'page_num', 'text', 'is_header']

UPLOAD_ACTIONS = ("Upload to Hugging Face", "Both")


def extract_paragraphs_with_headers(pdf_path, header_font_size=HEADER_FONT_SIZE):
    """Return one record per text block of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.
        header_font_size: A block counts as a header when at least one of
            its spans has a font size strictly greater than this value.

    Returns:
        A list of dicts with the keys ``page_num`` (1-based), ``text`` (the
        block's span texts joined with single spaces, stripped) and
        ``is_header`` (bool).
    """
    data = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    # Blocks without text lines (e.g. images) are skipped.
                    continue
                spans = [span for line in block["lines"] for span in line["spans"]]
                data.append({
                    "page_num": page_num,
                    "text": " ".join(span["text"] for span in spans).strip(),
                    "is_header": any(span["size"] > header_font_size for span in spans),
                })
    return data


def _uploaded_file_path(pdf_file):
    """Return the filesystem path of one uploaded file.

    Accepts both a plain path (str / ``os.PathLike``) and a file-like object
    whose ``name`` attribute holds the path, since the File component hands
    over one or the other depending on the Gradio version and its ``type``.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _upload_parquet(parquet_file, hf_token, dataset_repo_id):
    """Upload *parquet_file* to the dataset repo and return a status message.

    Never raises: every failure is converted into a message for the UI.
    """
    hf_token = (hf_token or "").strip()
    dataset_repo_id = (dataset_repo_id or "").strip()
    # Refuse blank credentials up front instead of handing them to the Hub
    # client, which may otherwise fall back to a token saved on this machine.
    if not hf_token:
        return "❌ Upload failed: a Hugging Face API token is required."
    if not dataset_repo_id:
        return "❌ Upload failed: a dataset repo ID is required."

    try:
        # The token is passed to the constructor; HfApi.set_access_token is
        # not available in current huggingface_hub releases.
        api = HfApi(token=hf_token)

        # Validate the user's repo before attempting the upload.
        try:
            api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
        except HfHubHTTPError:
            return "❌ Dataset repo not found. Please check the repo ID."

        api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=PARQUET_FILENAME,
            repo_id=dataset_repo_id,
            repo_type='dataset',
        )
        return f"✅ Successfully uploaded to {dataset_repo_id}"
    except Exception as e:
        # UI boundary: report the problem to the user rather than crashing
        # the request.
        return f"❌ Upload failed: {str(e)}"


def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
    """Convert the uploaded PDFs to one Parquet file and optionally upload it.

    Args:
        pdf_files: Uploaded files, as paths or file-like objects with a
            ``name`` attribute; may be ``None`` or empty.
        hf_token: Hugging Face API token, used only when uploading.
        dataset_repo_id: Target dataset repo, e.g. ``username/research-dataset``.
        action_choice: One of "Download Locally", "Upload to Hugging Face"
            or "Both".

    Returns:
        A ``(parquet_path, status_message)`` tuple. ``parquet_path`` is
        ``None`` when no PDF was provided; ``status_message`` is empty when
        no upload was requested.
    """
    if not pdf_files:
        return None, "❌ No PDF files provided."

    all_data = []
    for pdf_file in pdf_files:
        pdf_path = _uploaded_file_path(pdf_file)
        filename = os.path.basename(pdf_path)
        all_data.extend(
            {
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header'],
            }
            for item in extract_paragraphs_with_headers(pdf_path)
        )

    # Convert to DataFrame and save as Parquet.
    df = pd.DataFrame(all_data, columns=OUTPUT_COLUMNS)
    parquet_file = PARQUET_FILENAME
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    # Only upload if the user selects it.
    upload_message = ""
    if action_choice in UPLOAD_ACTIONS:
        upload_message = _upload_parquet(parquet_file, hf_token, dataset_repo_id)

    # Return the file for local download + upload status.
    return parquet_file, upload_message


# Gradio Interface
iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=[
        gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)"),
        gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token"),
        gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset"),
        gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally"),
    ],
    outputs=[
        gr.File(label="Download Parquet File"),
        gr.Textbox(label="Status"),
    ],
    title="PDF to Parquet Converter with User-Controlled Upload",
    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo.",
)

iface.launch()