Jobey1 committed on
Commit
9634b36
·
verified ·
1 Parent(s): f8e3b23

Create app.py

Browse files

Code to convert any PDF to Parquet format. This code creates markers to show where paragraphs start and end. This is important for chunking the data when training LLMs, because it keeps each paragraph together within a chunk.

Files changed (1) hide show
  1. app.py +45 -0
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pdfminer.high_level import extract_text
4
+ import os
5
+
6
def extract_paragraphs(text):
    """Split extracted text into a list of non-empty, stripped paragraphs.

    A paragraph boundary is one or more blank lines. A line counts as blank
    when it is empty or holds only whitespace, so breaks written as
    ``"\\n \\n"`` or with Windows line endings (``"\\r\\n\\r\\n"``) are
    recognised as well as a plain ``"\\n\\n"``.

    Args:
        text: The raw text to split. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of paragraph strings with surrounding whitespace removed.
        Line breaks inside a paragraph are kept as they appear in ``text``.
    """
    # Function-scope import keeps this block self-contained.
    import re

    if not text:
        return []

    # `\s*` swallows any whitespace (spaces, tabs, `\r`, extra newlines)
    # sitting between the two newlines that delimit a blank line.
    stripped = (piece.strip() for piece in re.split(r'\n\s*\n', text))
    return [piece for piece in stripped if piece]
10
+
11
def pdf_to_parquet(pdf_files):
    """Convert PDFs into a single Parquet file with one row per PDF.

    Each row has two columns: ``filename`` (the base name of the PDF path)
    and ``paragraphs`` (the list returned by ``extract_paragraphs`` for the
    text that ``extract_text`` pulled out of that PDF).

    Args:
        pdf_files: An iterable of uploaded files. Each item may be an object
            exposing its path as ``.name`` or a plain path string. ``None``
            is treated as "no files".

    Returns:
        The path of the written Parquet file, ``'converted_papers.parquet'``.
    """
    rows = []

    # The callback may be invoked with nothing uploaded; iterate an empty
    # list in that case instead of failing on `for ... in None`.
    for pdf_file in pdf_files or []:
        # Accept both file-like wrappers (path in `.name`) and bare paths.
        pdf_path = getattr(pdf_file, 'name', pdf_file)

        text = extract_text(pdf_path)

        rows.append({
            'filename': os.path.basename(pdf_path),
            'paragraphs': extract_paragraphs(text),
        })

    # Naming the columns keeps the schema stable even when `rows` is empty.
    df = pd.DataFrame(rows, columns=['filename', 'paragraphs'])

    # NOTE(review): this fixed path in the working directory is overwritten
    # on every call, so simultaneous requests would share one output file.
    parquet_file = 'converted_papers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    return parquet_file
35
+
36
# Gradio UI: a multi-file PDF upload wired to pdf_to_parquet, with the
# resulting Parquet file offered back as a download.
upload_box = gr.File(
    file_types=[".pdf"],
    file_count="multiple",
    label="Upload PDFs",
)
download_box = gr.File(label="Download Parquet File")

iface = gr.Interface(
    fn=pdf_to_parquet,
    inputs=upload_box,
    outputs=download_box,
    title="PDF to Parquet Converter with Paragraphs",
    description="Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training.",
)

# Start the web app as soon as this file is executed.
iface.launch()