Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Files Community

Convert-PDF-To-Parquet-With-paragraph-markers / app.py

Jobey1

Update app.py

c7a5739 verified about 1 month ago

raw

history blame

3.6 kB

	import gradio as gr
	import pandas as pd
	import fitz # PyMuPDF
	import os
	from huggingface_hub import HfApi, HfHubHTTPError

	def extract_paragraphs_with_headers(pdf_path, header_font_size=15):
	    """Extract the text blocks of a PDF and flag the ones that look like headers.

	    Each block returned by ``page.get_text("dict")`` that has a ``"lines"``
	    key yields one record; blocks without that key are skipped.

	    Args:
	        pdf_path: Path of the PDF file to open with PyMuPDF.
	        header_font_size: A block is flagged as a header when at least one
	            of its spans has a font size strictly greater than this value.

	    Returns:
	        A list of dicts with the keys ``"page_num"`` (1-based page index),
	        ``"text"`` (span texts joined by single spaces, stripped) and
	        ``"is_header"`` (bool).
	    """
	    records = []

	    # The context manager closes the document even if extraction raises,
	    # so the underlying file handle is not leaked.
	    with fitz.open(pdf_path) as doc:
	        for page_num, page in enumerate(doc, start=1):
	            for block in page.get_text("dict")["blocks"]:
	                if "lines" not in block:
	                    continue

	                spans = [
	                    span
	                    for line in block["lines"]
	                    for span in line["spans"]
	                ]
	                text = " ".join(span["text"] for span in spans).strip()

	                # Detect headers based on font size
	                is_header = any(
	                    span["size"] > header_font_size for span in spans
	                )

	                records.append({
	                    "page_num": page_num,
	                    "text": text,
	                    "is_header": is_header,
	                })

	    return records

	def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
	    """Convert PDFs to one Parquet file and optionally upload it to a dataset repo.

	    Args:
	        pdf_files: Iterable of uploaded files. Each item is either an object
	            exposing its path as ``.name`` or a plain path string. ``None``
	            is treated as "no files".
	        hf_token: Hugging Face access token used for the upload.
	        dataset_repo_id: Target dataset repository, e.g. ``"username/repo"``.
	        action_choice: The upload is attempted only for
	            ``"Upload to Hugging Face"`` or ``"Both"``.

	    Returns:
	        A ``(parquet_path, status_message)`` tuple. ``status_message`` is an
	        empty string when no upload was requested.
	    """
	    rows = []

	    # `pdf_files or []` keeps a None input from raising TypeError.
	    for pdf_file in pdf_files or []:
	        # Works for objects with a `.name` path attribute and for bare paths.
	        pdf_path = getattr(pdf_file, "name", pdf_file)
	        filename = os.path.basename(pdf_path)

	        for item in extract_paragraphs_with_headers(pdf_path):
	            rows.append({
	                'filename': filename,
	                'page_num': item['page_num'],
	                'text': item['text'],
	                'is_header': item['is_header'],
	            })

	    # Convert to DataFrame; explicit columns keep the schema stable even
	    # when no rows were extracted.
	    df = pd.DataFrame(
	        rows, columns=['filename', 'page_num', 'text', 'is_header']
	    )

	    # Save as Parquet
	    parquet_file = 'papers_with_headers.parquet'
	    df.to_parquet(parquet_file, engine='pyarrow', index=False)

	    upload_message = ""

	    # Only upload if the user selects it
	    if action_choice in ("Upload to Hugging Face", "Both"):
	        try:
	            # The token is handed to the client constructor; the older
	            # `set_access_token` call is not available on current HfApi
	            # objects (NOTE(review): confirm against the pinned
	            # huggingface_hub version).
	            api = HfApi(token=hf_token)

	            # Validate the user's repo
	            try:
	                api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
	                repo_exists = True
	            except HfHubHTTPError:
	                repo_exists = False

	            if repo_exists:
	                api.upload_file(
	                    path_or_fileobj=parquet_file,
	                    path_in_repo='papers_with_headers.parquet',
	                    repo_id=dataset_repo_id,
	                    repo_type='dataset',
	                )
	                upload_message = f"✅ Successfully uploaded to {dataset_repo_id}"
	            else:
	                upload_message = "❌ Dataset repo not found. Please check the repo ID."

	        except Exception as e:
	            # UI boundary: report the failure as a status string instead
	            # of raising, so the local download is still returned.
	            upload_message = f"❌ Upload failed: {str(e)}"

	    # Return the file for local download + upload status
	    return parquet_file, upload_message

	# Gradio Interface
	# Input widgets, listed in the positional order of the arguments of
	# pdf_to_parquet_and_upload.
	input_components = [
	    gr.File(
	        label="Upload PDFs (Drag & Drop or Search)",
	        file_count="multiple",
	        file_types=[".pdf"],
	    ),
	    gr.Textbox(
	        type="password",
	        label="Hugging Face API Token",
	        placeholder="Enter your Hugging Face API token",
	    ),
	    gr.Textbox(
	        label="Your Dataset Repo ID (e.g., username/research-dataset)",
	        placeholder="username/research-dataset",
	    ),
	    gr.Radio(
	        ["Download Locally", "Upload to Hugging Face", "Both"],
	        value="Download Locally",
	        label="Action",
	    ),
	]

	# Output widgets, matching the (parquet_file, upload_message) return value.
	output_components = [
	    gr.File(label="Download Parquet File"),
	    gr.Textbox(label="Status"),
	]

	iface = gr.Interface(
	    fn=pdf_to_parquet_and_upload,
	    inputs=input_components,
	    outputs=output_components,
	    title="PDF to Parquet Converter with User-Controlled Upload",
	    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo.",
	)

	# Start the web app as soon as the module is executed.
	iface.launch()