import gradio as gr
import pandas as pd
import fitz  # PyMuPDF
import os
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
import requests
import time


def extract_paragraphs_with_headers(pdf_path, progress=None):
    """Extract text blocks from a PDF, flagging likely section headers.

    Args:
        pdf_path: Filesystem path to the PDF to process.
        progress: Optional Gradio-style progress callback accepting
            ``(fraction, desc=...)``; ``None`` disables reporting.

    Returns:
        A list of dicts, one per text block, with keys:
        ``page_num`` (1-based page number), ``text`` (whitespace-joined
        span text) and ``is_header`` (True when any span in the block
        uses a font size above 15pt — a simple header heuristic).
    """
    print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
    doc = fitz.open(pdf_path)
    data = []
    total_pages = len(doc)

    # NOTE: enumerate(doc) yields at most total_pages items, so the old
    # manual "iteration limit" guard was unreachable and has been removed.
    for page_num, page in enumerate(doc):
        if progress is not None:
            progress(
                (page_num + 1) / total_pages,
                desc=f"Processing Page {page_num + 1}/{total_pages}",
            )

        for block in page.get_text("dict")["blocks"]:
            # Blocks without "lines" are images/drawings — skip them.
            if "lines" not in block:
                continue

            # Flatten every span of the block into one paragraph string.
            text = " ".join(
                span["text"]
                for line in block["lines"]
                for span in line["spans"]
            ).strip()

            # Header heuristic: any span rendered larger than 15pt.
            is_header = any(
                span["size"] > 15
                for line in block["lines"]
                for span in line["spans"]
            )

            data.append({
                "page_num": page_num + 1,
                "text": text,
                "is_header": is_header,
            })

    print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
    return data


def upload_with_progress(file_path, repo_id, token, progress):
    """Upload a file to a Hugging Face dataset repository.

    The previous implementation PUT raw 1 MB chunks to an unofficial
    ``/upload`` endpoint; each successive PUT would overwrite the prior
    chunk and only the final response was checked (and ``response`` was
    unbound for an empty file). This version uses the supported
    ``HfApi.upload_file`` API instead.

    Args:
        file_path: Local path of the file to upload.
        repo_id: Target dataset repo, e.g. ``"username/research-dataset"``.
        token: Hugging Face API token with write access.
        progress: Optional Gradio-style progress callback.

    Returns:
        A human-readable success message.

    Raises:
        Exception: Wrapping the underlying ``HfHubHTTPError`` on failure.
    """
    print(f"📤 Starting upload of Parquet: {file_path}")
    if progress is not None:
        progress(0.0, desc="Uploading to Hugging Face...")

    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
        )
    except HfHubHTTPError as e:
        raise Exception(f"❌ Upload failed: {e}") from e

    if progress is not None:
        progress(1.0, desc="Upload complete")

    print(f"✅ Successfully uploaded to {repo_id}")
    return f"✅ Successfully uploaded to {repo_id}"


def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id,
                              action_choice, progress=gr.Progress()):
    """Convert uploaded PDFs to a single Parquet file; optionally upload it.

    Args:
        pdf_files: List of Gradio file objects (each with a ``.name`` path).
        hf_token: Hugging Face API token (used only when uploading).
        dataset_repo_id: Target dataset repo id, e.g. ``"user/dataset"``.
        action_choice: One of ``"Download Locally"``,
            ``"Upload to Hugging Face"`` or ``"Both"``.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        Tuple of (parquet file path or ``None`` on failure, status message).
    """
    all_data = []
    total_files = len(pdf_files)
    print("🚀 Starting PDF to Parquet Conversion Process")

    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files,
                     desc=f"Processing File {idx + 1}/{total_files}")

        # Step 1: extract text blocks from this PDF.
        extracted_data = extract_paragraphs_with_headers(
            pdf_file.name, progress=progress
        )
        filename = os.path.basename(pdf_file.name)
        for item in extracted_data:
            all_data.append({
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header'],
            })

    print("🟡 Converting Processed Data to Parquet")

    # Step 2: convert the accumulated rows to Parquet.
    df = pd.DataFrame(all_data)
    parquet_file = 'papers_with_headers.parquet'
    try:
        df.to_parquet(parquet_file, engine='pyarrow', index=False)
        print("✅ Parquet Conversion Completed")
    except Exception as e:
        print(f"❌ Parquet Conversion Failed: {str(e)}")
        return None, f"❌ Parquet Conversion Failed: {str(e)}"

    upload_message = "Skipped Upload"

    # Step 3: upload the Parquet file if the user asked for it.
    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            upload_message = upload_with_progress(
                parquet_file, dataset_repo_id, hf_token, progress
            )
        except Exception as e:
            print(f"❌ Upload Failed: {str(e)}")
            upload_message = f"❌ Upload failed: {str(e)}"

    print("🏁 Process Completed")
    return parquet_file, upload_message


# Gradio interface wiring the conversion pipeline to a simple web UI.
iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=[
        gr.File(file_types=[".pdf"], file_count="multiple",
                label="Upload PDFs (Drag & Drop or Search)"),
        gr.Textbox(label="Hugging Face API Token", type="password",
                   placeholder="Enter your Hugging Face API token"),
        gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)",
                   placeholder="username/research-dataset"),
        gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"],
                 label="Action", value="Download Locally"),
    ],
    outputs=[
        gr.File(label="Download Parquet File"),
        gr.Textbox(label="Status"),
    ],
    title="PDF to Parquet Converter with Detailed Progress",
    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
)

if __name__ == "__main__":
    iface.launch()