Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Files Community

Convert-PDF-To-Parquet-With-paragraph-markers / app.py

Jobey1

Update app.py

dfa54c4 verified 8 months ago

raw

history blame

5.15 kB

	import gradio as gr
	import pandas as pd
	import fitz # PyMuPDF
	import os
	from huggingface_hub import HfApi
	from huggingface_hub.utils import HfHubHTTPError
	import time

	def extract_paragraphs_with_headers(pdf_path, progress=None, header_font_size=15):
	    """Extract the text blocks of a PDF and flag the ones that look like headers.

	    Each block that has a "lines" entry in the page's "dict" text output
	    becomes one record. The record's text is the block's span texts joined
	    with single spaces and stripped of surrounding whitespace.

	    Args:
	        pdf_path: Path of the PDF file to open.
	        progress: Optional callable invoked once per page as
	            ``progress(fraction, desc=...)``. Pass ``None`` to disable.
	        header_font_size: A block is flagged as a header when at least one
	            of its spans has a font size strictly greater than this value.

	    Returns:
	        A list of dicts with the keys ``"page_num"`` (1-based), ``"text"``
	        and ``"is_header"``, in page order and then block order.
	    """
	    pdf_name = os.path.basename(pdf_path)
	    print(f"📄 Starting PDF Processing: {pdf_name}")
	    data = []

	    # Open via the context manager so the document is closed even when a
	    # page fails to parse part-way through.
	    with fitz.open(pdf_path) as doc:
	        total_pages = len(doc)

	        for page_num, page in enumerate(doc, start=1):
	            if progress is not None:
	                progress(page_num / total_pages, desc=f"Processing Page {page_num}/{total_pages}")

	            for block in page.get_text("dict")["blocks"]:
	                # Blocks without "lines" have no text spans to read.
	                if "lines" not in block:
	                    continue

	                spans = [span for line in block["lines"] for span in line["spans"]]
	                text = " ".join(span["text"] for span in spans).strip()

	                # Detect headers based on font size.
	                is_header = any(span["size"] > header_font_size for span in spans)

	                data.append({
	                    "page_num": page_num,
	                    "text": text,
	                    "is_header": is_header,
	                })

	    print(f"✅ Finished Processing PDF: {pdf_name}")
	    return data

	def upload_with_progress(file_path, repo_id, token, progress=None):
	    """Upload a local file to a Hugging Face dataset repo and report the outcome.

	    The file is uploaded with ``HfApi.upload_file`` under its base name at
	    the root of the dataset repository. Failures raised during the upload
	    are caught and returned as a status message.

	    Args:
	        file_path: Path of the local file to upload.
	        repo_id: Target repository id, used with ``repo_type="dataset"``.
	        token: Hugging Face API token passed to the upload call.
	        progress: Optional callable invoked as ``progress(1, desc=...)``
	            once the upload has succeeded. Pass ``None`` to disable.

	    Returns:
	        A human-readable status string describing success or failure.
	    """
	    print(f"📤 Starting upload of Parquet: {file_path}")

	    api = HfApi()

	    try:
	        # Use upload_file() method from huggingface_hub
	        api.upload_file(
	            path_or_fileobj=file_path,
	            path_in_repo=os.path.basename(file_path),
	            repo_id=repo_id,
	            repo_type="dataset",
	            token=token,
	        )

	        if progress is not None:
	            progress(1, desc="✅ Upload Complete")

	        success_message = f"✅ Successfully uploaded to {repo_id}"
	        print(success_message)
	        return success_message

	    except HfHubHTTPError as e:
	        print(f"❌ Upload failed: {e}")
	        return f"❌ Upload failed: {str(e)}"
	    except Exception as e:
	        # Last-resort boundary: the caller displays the message to the user.
	        print(f"❌ Unexpected error: {e}")
	        return f"❌ Unexpected error: {str(e)}"

	def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
	    """Convert uploaded PDFs into one Parquet file and optionally upload it.

	    Every PDF is run through ``extract_paragraphs_with_headers``. The
	    resulting rows (``filename``, ``page_num``, ``text``, ``is_header``) are
	    written to ``papers_with_headers.parquet`` in the working directory.

	    Args:
	        pdf_files: Uploaded files; each entry is either an object exposing
	            the file path as ``.name`` or a plain path string. May be
	            ``None`` or empty when nothing was uploaded.
	        hf_token: Hugging Face API token used for the upload.
	        dataset_repo_id: Dataset repository id to upload to.
	        action_choice: The upload runs only for ``"Upload to Hugging Face"``
	            or ``"Both"``.
	        progress: Progress callback invoked as ``progress(fraction, desc=...)``.

	    Returns:
	        A ``(parquet_path, status_message)`` tuple. ``parquet_path`` is
	        ``None`` when there was no input or the Parquet conversion failed.
	    """
	    # Without this guard a missing upload crashes on len(None), and an empty
	    # list would write a Parquet file with no columns.
	    if not pdf_files:
	        print("⚠️ No PDF files provided")
	        return None, "⚠️ No PDF files provided."

	    all_data = []

	    total_files = len(pdf_files)
	    print("🚀 Starting PDF to Parquet Conversion Process")

	    for idx, pdf_file in enumerate(pdf_files):
	        if progress is not None:
	            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")

	        # Accept both file wrappers exposing `.name` and plain path strings.
	        pdf_path = getattr(pdf_file, "name", pdf_file)
	        filename = os.path.basename(pdf_path)

	        # ✅ Step 1: Process PDF
	        extracted_data = extract_paragraphs_with_headers(pdf_path, progress=progress)
	        all_data.extend(
	            {
	                'filename': filename,
	                'page_num': item['page_num'],
	                'text': item['text'],
	                'is_header': item['is_header'],
	            }
	            for item in extracted_data
	        )

	    print("🟡 Converting Processed Data to Parquet")
	    # ✅ Step 2: Convert to Parquet
	    df = pd.DataFrame(all_data)
	    parquet_file = 'papers_with_headers.parquet'

	    try:
	        df.to_parquet(parquet_file, engine='pyarrow', index=False)
	        print("✅ Parquet Conversion Completed")
	    except Exception as e:
	        print(f"❌ Parquet Conversion Failed: {str(e)}")
	        return None, f"❌ Parquet Conversion Failed: {str(e)}"

	    upload_message = "Skipped Upload"

	    # ✅ Step 3: Upload Parquet (if selected)
	    if action_choice in ["Upload to Hugging Face", "Both"]:
	        try:
	            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
	        except Exception as e:
	            print(f"❌ Upload Failed: {str(e)}")
	            upload_message = f"❌ Upload failed: {str(e)}"

	    print("🏁 Process Completed")
	    return parquet_file, upload_message

	# ✅ Gradio Interface
	# Input components, listed in the positional order of the first four
	# parameters of pdf_to_parquet_and_upload.
	input_components = [
	    gr.File(
	        file_types=[".pdf"],
	        file_count="multiple",
	        label="Upload PDFs (Drag & Drop or Search)",
	    ),
	    gr.Textbox(
	        label="Hugging Face API Token",
	        type="password",
	        placeholder="Enter your Hugging Face API token",
	    ),
	    gr.Textbox(
	        label="Your Dataset Repo ID (e.g., username/research-dataset)",
	        placeholder="username/research-dataset",
	    ),
	    gr.Radio(
	        ["Download Locally", "Upload to Hugging Face", "Both"],
	        label="Action",
	        value="Download Locally",
	    ),
	]

	# Output components, matching the (parquet file, status message) pair
	# returned by pdf_to_parquet_and_upload.
	output_components = [
	    gr.File(label="Download Parquet File"),
	    gr.Textbox(label="Status"),
	]

	iface = gr.Interface(
	    fn=pdf_to_parquet_and_upload,
	    inputs=input_components,
	    outputs=output_components,
	    title="PDF to Parquet Converter with Correct Upload API",
	    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset using the official API.",
	)

	# Start the web app as soon as this module is executed.
	iface.launch()