# File size: 1,270 Bytes
# 9634b36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import tempfile

import gradio as gr
import pandas as pd
from pdfminer.high_level import extract_text

def extract_paragraphs(text):
    """Return the non-blank paragraphs found in *text*.

    Paragraphs are the chunks separated by a double line break. Each chunk
    is stripped of surrounding whitespace, and chunks that are empty after
    stripping are dropped.
    """
    paragraphs = []
    for chunk in text.split('\n\n'):
        cleaned = chunk.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

def pdf_to_parquet(pdf_files):
    """Convert uploaded PDFs into a single Parquet file of paragraphs.

    Each PDF becomes one row with two columns: ``filename`` (the base name
    of the uploaded file) and ``paragraphs`` (the list produced by
    ``extract_paragraphs`` from the text of that PDF).

    Args:
        pdf_files: Iterable of uploaded files. Each item may be a path
            (``str`` or ``os.PathLike``) or an object that exposes the path
            as its ``.name`` attribute. ``None`` is treated as "no files".

    Returns:
        Path of the written Parquet file. The file is always named
        ``converted_papers.parquet`` and lives in a freshly created
        temporary directory, which this function does not remove.
    """
    rows = []

    # `pdf_files` is None when nothing was uploaded; treat that as empty
    # instead of failing on iteration.
    for pdf_file in pdf_files or []:
        # NOTE(review): uploads may arrive as plain path strings or as
        # file wrappers carrying the path in `.name`, depending on the
        # Gradio version/config -- accept both. Path-like objects are checked
        # first because `pathlib.Path.name` is only the base name.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        # Extract text from PDF and split it into paragraphs
        text = extract_text(pdf_path)
        rows.append({
            'filename': os.path.basename(pdf_path),
            'paragraphs': extract_paragraphs(text),
        })

    # Explicit columns keep the schema stable even when there are no rows.
    df = pd.DataFrame(rows, columns=['filename', 'paragraphs'])

    # Write into a per-call directory so simultaneous requests cannot
    # overwrite each other's output; the download name stays the same.
    output_dir = tempfile.mkdtemp(prefix='pdf_to_parquet_')
    parquet_file = os.path.join(output_dir, 'converted_papers.parquet')
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    return parquet_file

# Gradio UI: multi-PDF upload in, single Parquet download out.
upload_widget = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
download_widget = gr.File(label="Download Parquet File")

iface = gr.Interface(
    title="PDF to Parquet Converter with Paragraphs",
    description="Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training.",
    fn=pdf_to_parquet,
    inputs=upload_widget,
    outputs=download_widget,
)

# Start the web app as soon as this module is executed.
iface.launch()