# Hugging Face Space file: app.py (3.6 kB)
# Latest commit: "Update app.py" by Jobey1 (c7a5739, verified)
import gradio as gr
import pandas as pd
import fitz # PyMuPDF
import os
from huggingface_hub import HfApi, HfHubHTTPError
def extract_paragraphs_with_headers(pdf_path, header_font_size=15):
    """Extract the text blocks of a PDF and flag the ones that look like headers.

    Every block that contains text lines becomes one record. The text of all
    spans in the block is joined with single spaces and stripped. A block is
    flagged as a header when at least one of its spans has a font size
    strictly greater than ``header_font_size``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        header_font_size: Font-size threshold; a span larger than this marks
            its whole block as a header. Defaults to 15.

    Returns:
        A list of dicts, in document order, each with the keys ``page_num``
        (1-based page index), ``text`` and ``is_header``.
    """
    records = []
    # The context manager closes the document even if extraction raises,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry hold no text; skip them.
                if "lines" not in block:
                    continue
                spans = [
                    span
                    for line in block["lines"]
                    for span in line["spans"]
                ]
                text = " ".join(span["text"] for span in spans).strip()
                is_header = any(
                    span["size"] > header_font_size for span in spans
                )
                records.append({
                    "page_num": page_num,
                    "text": text,
                    "is_header": is_header,
                })
    return records
def _upload_parquet_to_hub(parquet_file, hf_token, dataset_repo_id):
    """Upload ``parquet_file`` to an existing dataset repo; return a status message."""
    if not hf_token:
        # Without an explicit token the Hub client may fall back to
        # credentials stored on the host, which must never be used on
        # behalf of an arbitrary visitor.
        return "❌ Upload failed: a Hugging Face API token is required."

    try:
        # Pass the token to the client directly; the old
        # HfApi.set_access_token helper is no longer available in current
        # huggingface_hub releases.
        api = HfApi(token=hf_token)

        # Validate the user's repo before attempting the upload.
        try:
            api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
        except HfHubHTTPError:
            return "❌ Dataset repo not found. Please check the repo ID."

        api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo='papers_with_headers.parquet',
            repo_id=dataset_repo_id,
            repo_type='dataset',
        )
    except Exception as e:
        # Boundary of the UI callback: report the failure instead of crashing.
        return f"❌ Upload failed: {str(e)}"

    return f"✅ Successfully uploaded to {dataset_repo_id}"


def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
    """Convert uploaded PDFs to one Parquet file and optionally upload it.

    Each PDF is run through ``extract_paragraphs_with_headers``; the resulting
    records are tagged with the PDF's base filename and written to
    ``papers_with_headers.parquet`` in the current working directory.

    Args:
        pdf_files: Iterable of uploaded files. Each item may be an object
            exposing the file path as ``.name`` or a plain path string.
        hf_token: Hugging Face API token used for the upload.
        dataset_repo_id: Target dataset repo ID, e.g. ``username/dataset``.
        action_choice: ``"Download Locally"``, ``"Upload to Hugging Face"``
            or ``"Both"``; the upload only runs for the latter two.

    Returns:
        A ``(parquet_path, status_message)`` tuple. ``parquet_path`` is
        ``None`` when no PDF was supplied; ``status_message`` is empty when
        no upload was requested.
    """
    if not pdf_files:
        return None, "❌ No PDF files provided."

    rows = []
    for pdf_file in pdf_files:
        # Accept both file wrappers (path in .name) and plain path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        filename = os.path.basename(pdf_path)
        rows.extend(
            {
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header'],
            }
            for item in extract_paragraphs_with_headers(pdf_path)
        )

    # Explicit columns keep the Parquet schema stable even when the PDFs
    # contained no extractable text.
    df = pd.DataFrame(
        rows, columns=['filename', 'page_num', 'text', 'is_header']
    )

    # Save as Parquet
    parquet_file = 'papers_with_headers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    upload_message = ""
    # Only upload if the user selects it
    if action_choice in ("Upload to Hugging Face", "Both"):
        upload_message = _upload_parquet_to_hub(
            parquet_file, hf_token, dataset_repo_id
        )

    # Return the file for local download + upload status
    return parquet_file, upload_message
# Gradio Interface
# Input widgets, listed in the positional order of the parameters of
# pdf_to_parquet_and_upload: files, token, repo ID, action.
input_components = [
    gr.File(
        file_types=[".pdf"],
        file_count="multiple",
        label="Upload PDFs (Drag & Drop or Search)",
    ),
    gr.Textbox(
        label="Hugging Face API Token",
        type="password",
        placeholder="Enter your Hugging Face API token",
    ),
    gr.Textbox(
        label="Your Dataset Repo ID (e.g., username/research-dataset)",
        placeholder="username/research-dataset",
    ),
    gr.Radio(
        ["Download Locally", "Upload to Hugging Face", "Both"],
        label="Action",
        value="Download Locally",
    ),
]

# Output widgets, matching the (parquet file, status message) return tuple.
output_components = [
    gr.File(label="Download Parquet File"),
    gr.Textbox(label="Status"),
]

iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=input_components,
    outputs=output_components,
    title="PDF to Parquet Converter with User-Controlled Upload",
    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo.",
)

# Start the web server as soon as the module is executed.
iface.launch()