import gradio as gr
import pandas as pd
import fitz  # PyMuPDF
import os
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
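# Dependencies: pip install gradio pandas pymupdf pyarrow huggingface_hub
# (fitz is provided by the pymupdf package; pyarrow backs df.to_parquet below)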
def extract_paragraphs_with_headers(pdf_path, progress=None):
    """Extract text blocks from a PDF, flagging likely headers by font size."""
    print(f"🚀 Starting PDF Processing: {os.path.basename(pdf_path)}")
    doc = fitz.open(pdf_path)
    data = []

    total_pages = len(doc)
    # Safety valve against malformed PDFs that make iteration misbehave.
    max_iterations = total_pages * 2
    iteration_count = 0

    for page_num, page in enumerate(doc):
        iteration_count += 1
        if iteration_count > max_iterations:
            raise Exception("⚠️ PDF processing exceeded iteration limit. Possible malformed PDF.")

        if progress is not None:
            progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")

        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" in block:
                text = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        text += span["text"] + " "

                text = text.strip()
                if not text:
                    continue  # skip empty blocks (images, rules, whitespace)

                # Treat any block containing a span larger than 15 pt as a header.
                is_header = any(span["size"] > 15 for line in block["lines"] for span in line["spans"])

                data.append({
                    "page_num": page_num + 1,
                    "text": text,
                    "is_header": is_header
                })

    print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
    return data
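# Shape of each record returned above (illustrative values):
#   {"page_num": 3, "text": "2. Methodology", "is_header": True}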
def upload_with_progress(file_path, repo_id, token, progress):
    """
    Upload the Parquet file to a Hugging Face Dataset repo, reporting
    coarse progress around the transfer.
    """
    print(f"🚀 Starting upload of Parquet: {file_path}")
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

    if progress is not None:
        progress(0.0, desc=f"Uploading {file_size_mb:.1f}MB to {repo_id}...")

    # Use the official client, which handles chunking, retries, and auth;
    # hand-rolled per-chunk PUT requests would not assemble into one file.
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
        )
    except HfHubHTTPError as e:
        raise Exception(f"❌ Upload failed: {e}")

    if progress is not None:
        progress(1.0, desc="Upload complete")

    print(f"✅ Successfully uploaded to {repo_id}")
    return f"✅ Successfully uploaded to {repo_id}"
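# Example call (hypothetical repo and token source):
#   upload_with_progress("papers_with_headers.parquet",
#                        "username/research-dataset",
#                        os.environ["HF_TOKEN"], progress=None)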
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
    all_data = []

    total_files = len(pdf_files)
    print("🚀 Starting PDF to Parquet Conversion Process")

    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")

        # Gradio may hand back tempfile wrappers (with .name) or plain path strings.
        pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
        extracted_data = extract_paragraphs_with_headers(pdf_path, progress=progress)
        for item in extracted_data:
            all_data.append({
                'filename': os.path.basename(pdf_path),
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header']
            })

    print("📦 Converting Processed Data to Parquet")

    df = pd.DataFrame(all_data)
    parquet_file = 'papers_with_headers.parquet'

    try:
        df.to_parquet(parquet_file, engine='pyarrow', index=False)
        print("✅ Parquet Conversion Completed")
    except Exception as e:
        print(f"❌ Parquet Conversion Failed: {str(e)}")
        return None, f"❌ Parquet Conversion Failed: {str(e)}"

    upload_message = "Skipped Upload"

    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
        except Exception as e:
            print(f"❌ Upload Failed: {str(e)}")
            upload_message = f"❌ Upload failed: {str(e)}"

    print("🎉 Process Completed")
    return parquet_file, upload_message
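# The (file, message) pair returned above feeds the gr.File and gr.Textbox
# outputs of the interface defined below.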
iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=[
        gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)"),
        gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token"),
        gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset"),
        gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally")
    ],
    outputs=[
        gr.File(label="Download Parquet File"),
        gr.Textbox(label="Status")
    ],
    title="PDF to Parquet Converter with Detailed Progress",
    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
)

iface.launch()
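# launch() serves the app locally; pass share=True for a temporary public link.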