import gradio as gr
import pandas as pd
import fitz # PyMuPDF
import os
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
import requests
import time

def extract_paragraphs_with_headers(pdf_path, progress=None):
    print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
    doc = fitz.open(pdf_path)
    data = []
    total_pages = len(doc)

    for page_num, page in enumerate(doc):
        if progress is not None:
            progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")

        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" not in block:
                continue  # Skip image/drawing blocks that carry no text

            # Join all spans in the block into a single paragraph of text
            text = " ".join(
                span["text"] for line in block["lines"] for span in line["spans"]
            ).strip()
            if not text:
                continue

            # Detect headers based on font size: any span larger than 15 pt
            is_header = any(
                span["size"] > 15 for line in block["lines"] for span in line["spans"]
            )

            data.append({
                "page_num": page_num + 1,
                "text": text,
                "is_header": is_header,
            })

    print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
    return data
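
# Standalone usage sketch (assumes a local file "paper.pdf" exists; not part of the app flow):
#
#   rows = extract_paragraphs_with_headers("paper.pdf")
#   # Each row looks like: {"page_num": 1, "text": "Introduction", "is_header": True}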

def upload_with_progress(file_path, repo_id, token, progress):
    """
    Upload the Parquet file to a Hugging Face dataset repo.

    Uses HfApi.upload_file, which handles large-file uploads internally;
    the progress callback is updated at the start and end of the transfer.
    """
    print(f"📤 Starting upload of Parquet: {file_path}")
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

    if progress is not None:
        progress(0.0, desc=f"Uploading {file_size_mb:.1f} MB to {repo_id}...")

    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
        )
    except HfHubHTTPError as e:
        raise Exception(f"❌ Upload failed: {e}")

    if progress is not None:
        progress(1.0, desc="Upload complete")

    print(f"✅ Successfully uploaded to {repo_id}")
    return f"✅ Successfully uploaded to {repo_id}"

def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
    all_data = []
    total_files = len(pdf_files)

    print("🚀 Starting PDF to Parquet Conversion Process")

    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")

        # Gradio may pass file objects (with .name) or plain file paths, depending on version
        pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

        # ✅ Step 1: Process PDF
        extracted_data = extract_paragraphs_with_headers(pdf_path, progress=progress)
        for item in extracted_data:
            all_data.append({
                'filename': os.path.basename(pdf_path),
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header'],
            })

    print("🟡 Converting Processed Data to Parquet")

    # ✅ Step 2: Convert to Parquet
    df = pd.DataFrame(all_data)
    parquet_file = 'papers_with_headers.parquet'
    try:
        df.to_parquet(parquet_file, engine='pyarrow', index=False)
        print("✅ Parquet Conversion Completed")
    except Exception as e:
        print(f"❌ Parquet Conversion Failed: {str(e)}")
        return None, f"❌ Parquet Conversion Failed: {str(e)}"

    upload_message = "Skipped Upload"

    # ✅ Step 3: Upload Parquet (if selected)
    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
        except Exception as e:
            print(f"❌ Upload Failed: {str(e)}")
            upload_message = f"❌ Upload failed: {str(e)}"

    print("🏁 Process Completed")
    return parquet_file, upload_message
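
# The two return values map to the two Gradio outputs below: the Parquet file
# offered for download and a status string describing the upload result.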

# ✅ Gradio Interface
iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=[
        gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)"),
        gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token"),
        gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset"),
        gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally"),
    ],
    outputs=[
        gr.File(label="Download Parquet File"),
        gr.Textbox(label="Status"),
    ],
    title="PDF to Parquet Converter with Detailed Progress",
    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators.",
)

iface.launch()