|
import gradio as gr |
|
import pandas as pd |
|
from pdfminer.high_level import extract_text |
|
import os |
|
|
|
def extract_paragraphs(text):
    """Split raw text into a list of non-empty, stripped paragraphs.

    A paragraph boundary is a blank line: two newlines separated by nothing
    but optional whitespace. Matching on that (instead of the literal
    ``'\\n\\n'``) also handles blank lines that carry stray spaces or tabs and
    Windows-style ``\\r\\n`` line endings, which would otherwise leave
    adjacent paragraphs merged together.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        The paragraphs in their original order, each with surrounding
        whitespace removed. Whitespace-only chunks are dropped.
    """
    # Local import so this function does not depend on a file-level import.
    import re

    if not text:
        return []

    chunks = re.split(r'\n\s*\n', text)
    return [chunk.strip() for chunk in chunks if chunk.strip()]
|
|
|
def pdf_to_parquet(pdf_files):
    """Convert uploaded PDFs into one Parquet file of per-document paragraphs.

    For each upload the text is extracted with ``extract_text`` and split
    with ``extract_paragraphs``. The result is one row per PDF with two
    columns: ``filename`` (base name of the uploaded file) and
    ``paragraphs`` (list of paragraph strings).

    Args:
        pdf_files: Iterable of uploads. Each item is either a path string or
            an object exposing the path as ``.name``. ``None`` is treated as
            "no files".

    Returns:
        The path of the written Parquet file. The name is fixed and relative
        to the current working directory, so every call overwrites the
        output of the previous one.
    """
    data = []

    # The callback may receive None when nothing was uploaded; treat that as
    # an empty batch instead of failing on iteration.
    for pdf_file in pdf_files or []:
        # Depending on the Gradio version/configuration, an upload arrives
        # either as a tempfile-like wrapper (path in `.name`) or as a plain
        # path string; accept both.
        pdf_path = getattr(pdf_file, 'name', pdf_file)

        text = extract_text(pdf_path)
        paragraphs = extract_paragraphs(text)

        data.append({
            'filename': os.path.basename(pdf_path),
            'paragraphs': paragraphs,
        })

    # Explicit columns keep the output schema stable even for an empty batch.
    df = pd.DataFrame(data, columns=['filename', 'paragraphs'])

    parquet_file = 'converted_papers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    return parquet_file
|
|
|
|
|
# Input widget: accepts any number of uploads, restricted to .pdf files.
_upload_input = gr.File(
    label="Upload PDFs",
    file_count="multiple",
    file_types=[".pdf"],
)

# Output widget: offers the generated Parquet file for download.
_download_output = gr.File(label="Download Parquet File")

# Wire the conversion function to the two widgets.
iface = gr.Interface(
    fn=pdf_to_parquet,
    inputs=_upload_input,
    outputs=_download_output,
    title="PDF to Parquet Converter with Paragraphs",
    description="Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training.",
)

# Start the web UI as soon as the module is executed.
iface.launch()
|
|