import os
import tempfile

import gradio as gr
import pandas as pd
from pdfminer.high_level import extract_text


def extract_paragraphs(text):
    """Split *text* into paragraphs.

    Paragraphs are delimited by double line breaks (``'\\n\\n'``). Each
    paragraph is stripped of surrounding whitespace, and chunks that are
    empty after stripping are dropped.

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty, stripped paragraph strings, in document order.
    """
    stripped_chunks = (chunk.strip() for chunk in text.split('\n\n'))
    return [chunk for chunk in stripped_chunks if chunk]


def pdf_to_parquet(pdf_files):
    """Convert uploaded PDFs into a single Parquet file of paragraphs.

    One row is produced per PDF, with two columns: ``filename`` (the base
    name of the uploaded file) and ``paragraphs`` (the list returned by
    ``extract_paragraphs`` for that file's text).

    Args:
        pdf_files: The uploads handed over by the Gradio ``File`` input.
            Each item may be an object exposing the file path as ``.name``
            or a plain path string. ``None`` is treated as "no files".

    Returns:
        Path of the written Parquet file.
    """
    data = []
    # NOTE(review): Gradio presumably passes None when the form is submitted
    # without any upload; treat that like an empty selection, not a crash.
    for pdf_file in pdf_files or []:
        # NOTE(review): depending on the Gradio version / ``type`` setting,
        # uploads appear to arrive either as tempfile-like wrappers (path in
        # ``.name``) or as plain path strings. Accept both.
        pdf_path = getattr(pdf_file, 'name', pdf_file)

        # Extract text from PDF
        text = extract_text(pdf_path)

        data.append({
            'filename': os.path.basename(pdf_path),
            'paragraphs': extract_paragraphs(text),
        })

    # Explicit columns keep the schema stable even when there are no rows.
    df = pd.DataFrame(data, columns=['filename', 'paragraphs'])

    # Write into a fresh directory per call so concurrent requests do not
    # overwrite each other's output, while keeping the download file name.
    # The directory is not removed here because the caller still needs the
    # file after this function returns.
    output_dir = tempfile.mkdtemp(prefix='pdf_to_parquet_')
    parquet_file = os.path.join(output_dir, 'converted_papers.parquet')
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    return parquet_file


# Gradio Interface
iface = gr.Interface(
    fn=pdf_to_parquet,
    inputs=gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs"),
    outputs=gr.File(label="Download Parquet File"),
    title="PDF to Parquet Converter with Paragraphs",
    description="Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training."
)

# Only start the server when run as a script, so importing this module
# (e.g. to reuse the helpers) has no side effects.
if __name__ == "__main__":
    iface.launch()