File size: 4,250 Bytes
9634b36 c7a5739 9634b36 7998543 fb7ac68 7998543 c7a5739 9634b36 7998543 c7a5739 7998543 c7a5739 7998543 c7a5739 7998543 c7a5739 9634b36 7998543 c7a5739 7998543 c7a5739 9634b36 c7a5739 9634b36 c7a5739 9634b36 c7a5739 7998543 c7a5739 7998543 c7a5739 9634b36 c7a5739 7998543 9634b36 7998543 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import gradio as gr
import pandas as pd
import fitz # PyMuPDF
import os
from huggingface_hub import HfApi, HfHubHTTPError
import requests
import time
def extract_paragraphs_with_headers(pdf_path, progress=None, header_font_size=15):
    """Extract the text blocks of a PDF and flag the ones that look like headers.

    Every block returned by ``page.get_text("dict")`` that has a ``"lines"``
    key becomes one record; blocks without that key are skipped.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        progress: Optional callable invoked once per page as
            ``progress(fraction, desc=...)``. Pass ``None`` to disable.
        header_font_size: A block is flagged as a header when any of its
            spans has a font size strictly greater than this value.
            Defaults to 15.

    Returns:
        A list of dicts with the keys ``"page_num"`` (1-based), ``"text"``
        (span texts joined by single spaces, stripped) and ``"is_header"``.
    """
    data = []
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        for page_num, page in enumerate(doc, start=1):
            # Compare against None instead of relying on truthiness: a
            # progress object that defines __len__ (gradio's Progress appears
            # to) may be falsy, or even raise, when evaluated as a bool.
            if progress is not None:
                progress(page_num / total_pages, desc=f"Processing Page {page_num}/{total_pages}")
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue
                spans = [span for line in block["lines"] for span in line["spans"]]
                text = " ".join(span["text"] for span in spans).strip()
                # Detect headers based on font size
                is_header = any(span["size"] > header_font_size for span in spans)
                data.append({
                    "page_num": page_num,
                    "text": text,
                    "is_header": is_header,
                })
    finally:
        # Release the underlying file handle even if extraction fails midway.
        doc.close()
    return data
def upload_with_progress(file_path, repo_id, token, progress):
    """Upload a file to a Hugging Face dataset repo and report progress.

    The file is sent in a single ``HfApi.upload_file`` call and stored in the
    repo under its base name. Progress is reported once before the upload
    starts and once after it completes.

    Args:
        file_path: Path of the local file to upload.
        repo_id: Target dataset repo, e.g. ``"username/research-dataset"``.
        token: Hugging Face API token with write access to the repo.
        progress: Callable invoked as ``progress(fraction, desc=...)``, or
            ``None`` to skip progress reporting.

    Returns:
        A success message naming the repo.

    Raises:
        ValueError: If ``token`` or ``repo_id`` is empty.
        Exception: Whatever ``HfApi.upload_file`` raises when the upload
            fails (authentication, missing repo, network errors, ...).
    """
    token = (token or "").strip()
    repo_id = (repo_id or "").strip()
    if not token:
        raise ValueError("A Hugging Face API token is required for upload.")
    if not repo_id:
        raise ValueError("A dataset repo ID is required for upload.")

    file_size_mb = os.path.getsize(file_path) // (1024 * 1024)
    if progress is not None:
        progress(0, desc=f"Uploading... 0MB/{file_size_mb}MB")

    # Sending each chunk as its own PUT request does not assemble a file on
    # the Hub; the client library performs the upload and commit correctly.
    HfApi().upload_file(
        path_or_fileobj=file_path,
        path_in_repo=os.path.basename(file_path),
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
    )

    if progress is not None:
        progress(1.0, desc=f"Uploading... {file_size_mb}MB/{file_size_mb}MB")
    return f"✅ Successfully uploaded to {repo_id}"
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
    """Convert uploaded PDFs to one Parquet file and optionally upload it.

    Args:
        pdf_files: Uploaded files from the Gradio file input. Each entry may
            be an object exposing the path as ``.name`` or a plain path
            string. ``None`` (nothing uploaded) is treated as an empty list.
        hf_token: Hugging Face API token, used only when uploading.
        dataset_repo_id: Target dataset repo ID, used only when uploading.
        action_choice: The upload happens when this is
            ``"Upload to Hugging Face"`` or ``"Both"``.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            kept in the signature so Gradio can supply its tracker.

    Returns:
        A tuple ``(parquet_path, status_message)``. The status message is
        empty when no upload was requested, and describes the outcome
        otherwise.
    """
    columns = ['filename', 'page_num', 'text', 'is_header']
    all_data = []
    # Process each uploaded PDF
    for pdf_file in pdf_files or []:
        # Accept both file-like wrappers (.name) and plain path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        filename = os.path.basename(pdf_path)
        extracted_data = extract_paragraphs_with_headers(pdf_path, progress=progress)
        all_data.extend(
            {
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header'],
            }
            for item in extracted_data
        )
    # Convert to DataFrame; explicit columns keep the schema when no rows exist
    df = pd.DataFrame(all_data, columns=columns)
    # Save as Parquet
    parquet_file = 'papers_with_headers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)
    upload_message = ""
    # Only upload if the user selects it
    if action_choice in ("Upload to Hugging Face", "Both"):
        try:
            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
        except Exception as e:
            # UI boundary: report the failure in the status box, not as a crash.
            upload_message = f"❌ Upload failed: {str(e)}"
    # Return Parquet file and status message
    return parquet_file, upload_message
# Gradio Interface
# Input components, in the order of the handler's positional parameters:
# PDF files, API token, dataset repo ID, action choice.
input_components = [
    gr.File(
        label="Upload PDFs (Drag & Drop or Search)",
        file_count="multiple",
        file_types=[".pdf"],
    ),
    gr.Textbox(
        label="Hugging Face API Token",
        placeholder="Enter your Hugging Face API token",
        type="password",
    ),
    gr.Textbox(
        label="Your Dataset Repo ID (e.g., username/research-dataset)",
        placeholder="username/research-dataset",
    ),
    gr.Radio(
        ["Download Locally", "Upload to Hugging Face", "Both"],
        value="Download Locally",
        label="Action",
    ),
]

# Output components, matching the handler's (parquet_file, upload_message) return.
output_components = [
    gr.File(label="Download Parquet File"),
    gr.Textbox(label="Status"),
]

iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=input_components,
    outputs=output_components,
    title="PDF to Parquet Converter with Upload Progress",
    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo with real-time progress tracking.",
)

# Start the web app when the script is executed.
iface.launch()
|