Create app.py
Browse files
Code to convert any PDF to Parquet format. This code creates markers to show where paragraphs start and end. This is important for chunking the data when training LLMs, because it keeps each paragraph together within a chunk.
app.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from pdfminer.high_level import extract_text
|
4 |
+
import os
|
5 |
+
|
6 |
+
def extract_paragraphs(text):
    """Split *text* into a list of non-empty, stripped paragraphs.

    A paragraph boundary is one or more blank lines. A line counts as blank
    when it is empty or contains only whitespace, so Windows line endings
    ("\\r\\n\\r\\n") and stray spaces on the separator line still split.

    Args:
        text: The raw text to split.

    Returns:
        The paragraphs in document order, each with leading and trailing
        whitespace removed. Line breaks inside a paragraph are kept.
        Returns an empty list when *text* has no non-whitespace content.
    """
    import re  # local import: the file's top-level imports do not include re

    # "\n\s*\n" matches a newline, any run of whitespace (including further
    # newlines and "\r"), and a closing newline -- i.e. one or more blank lines.
    chunks = re.split(r'\n\s*\n', text)
    return [chunk.strip() for chunk in chunks if chunk.strip()]
|
10 |
+
|
11 |
+
def pdf_to_parquet(pdf_files):
    """Extract paragraphs from the given PDFs and write them to a Parquet file.

    Args:
        pdf_files: Iterable of uploaded PDFs. Each item may be a path
            (``str`` or ``os.PathLike``) or an object that exposes the path
            as ``.name``. ``None`` or an empty iterable means "no files" and
            produces a Parquet file with zero rows.

    Returns:
        The path of the written file, ``'converted_papers.parquet'``. The
        file has one row per PDF with the columns ``filename`` (base name of
        the PDF) and ``paragraphs`` (list of paragraph strings).
    """
    rows = []

    # The upload widget yields None when nothing was uploaded.
    for pdf_file in pdf_files or []:
        # Accept plain paths as well as file-like wrappers carrying ``.name``.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        text = extract_text(pdf_path)
        rows.append({
            'filename': os.path.basename(pdf_path),
            'paragraphs': extract_paragraphs(text),
        })

    # Explicit columns keep the schema stable even when there are no rows.
    df = pd.DataFrame(rows, columns=['filename', 'paragraphs'])

    # NOTE(review): the output path is fixed and relative to the working
    # directory, so every call overwrites the previous result.
    parquet_file = 'converted_papers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    return parquet_file
|
35 |
+
|
36 |
+
# Gradio UI: a multi-file PDF upload wired to pdf_to_parquet, with the
# resulting Parquet file offered as a download.
_pdf_upload = gr.File(
    file_types=[".pdf"],
    file_count="multiple",
    label="Upload PDFs",
)
_parquet_download = gr.File(label="Download Parquet File")

iface = gr.Interface(
    fn=pdf_to_parquet,
    inputs=_pdf_upload,
    outputs=_parquet_download,
    title="PDF to Parquet Converter with Paragraphs",
    description="Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training.",
)

# Start the app as soon as this module is executed.
iface.launch()
|