Jobey1's picture
Create app.py
9634b36 verified
raw
history blame
1.27 kB
import gradio as gr
import pandas as pd
from pdfminer.high_level import extract_text
import os
def extract_paragraphs(text):
    """Split raw text into a list of non-empty, stripped paragraphs.

    A paragraph boundary is a blank line: two line breaks with nothing,
    or only whitespace, between them. This covers plain ``"\\n\\n"`` as
    well as blank lines that carry stray spaces/tabs and Windows-style
    ``"\\r\\n\\r\\n"`` breaks, which a literal ``"\\n\\n"`` split would miss.
    Single line breaks inside a paragraph are left untouched.

    Args:
        text: The text to split. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of paragraph strings with surrounding whitespace removed;
        whitespace-only chunks are dropped.
    """
    # Local import so this function does not rely on a file-level import.
    import re

    if not text:
        return []

    # "\s*" between the two newlines absorbs spaces, tabs, "\r" and any
    # extra blank lines, so a run of blank lines is one boundary.
    chunks = re.split(r'\n\s*\n', text)
    stripped = (chunk.strip() for chunk in chunks)
    return [paragraph for paragraph in stripped if paragraph]
def pdf_to_parquet(pdf_files):
    """Convert PDFs into one Parquet file with a row per PDF.

    Each row has two columns: ``filename`` (the base name of the PDF
    path) and ``paragraphs`` (the list returned by
    ``extract_paragraphs`` for the text extracted from that PDF).

    Args:
        pdf_files: An iterable of PDFs. Each item may be a path
            (``str`` / ``os.PathLike``) or an object whose ``name``
            attribute holds the path. A single such item is also
            accepted, and ``None`` is treated as an empty batch.

    Returns:
        The path of the written Parquet file
        (``'converted_papers.parquet'``).

    Raises:
        Whatever ``extract_text`` or ``DataFrame.to_parquet`` raise;
        a failure on any one PDF aborts the whole conversion.
    """
    # Normalise the input to an iterable of items so the loop below never
    # iterates over None or over the characters of a lone path string.
    if pdf_files is None:
        pdf_files = []
    elif isinstance(pdf_files, (str, os.PathLike)) or hasattr(pdf_files, 'name'):
        pdf_files = [pdf_files]

    data = []
    for pdf_file in pdf_files:
        # Check for real paths first: pathlib.Path also has a ``name``
        # attribute, but it is only the base name, not the full path.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        text = extract_text(pdf_path)
        data.append({
            'filename': os.path.basename(pdf_path),
            'paragraphs': extract_paragraphs(text),
        })

    # Explicit columns keep the schema stable even for an empty batch.
    df = pd.DataFrame(data, columns=['filename', 'paragraphs'])

    # NOTE: the output path is fixed and relative to the working
    # directory, so every call overwrites the previous result.
    parquet_file = 'converted_papers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)
    return parquet_file
# Gradio Interface
# One-function UI: the uploaded PDFs are handed to pdf_to_parquet and the
# path it returns is offered back to the user as a downloadable file.
iface = gr.Interface(
    fn=pdf_to_parquet,
    inputs=gr.File(
        label="Upload PDFs",
        file_count="multiple",
        file_types=[".pdf"],
    ),
    outputs=gr.File(label="Download Parquet File"),
    title="PDF to Parquet Converter with Paragraphs",
    description=(
        "Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training."
    ),
)

# Launch the interface as soon as this module is executed.
iface.launch()