Update app.py
Browse files
Users can drag and drop PDFs or search for PDFs they want to convert to Parquet format. Users can download the converted file to their local machine, or can choose to upload it to a Hugging Face dataset using the API key for their repo. No API keys are collected or saved. The code checks that the repo exists before it transfers the files.
app.py
CHANGED
@@ -1,45 +1,103 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
-
|
4 |
import os
|
|
|
5 |
|
6 |
-
def
|
7 |
-
|
8 |
-
paragraphs = [para.strip() for para in text.split('\n\n') if para.strip()]
|
9 |
-
return paragraphs
|
10 |
-
|
11 |
-
def pdf_to_parquet(pdf_files):
|
12 |
data = []
|
13 |
|
14 |
-
for
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
|
19 |
-
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
26 |
|
27 |
# Convert to DataFrame
|
28 |
-
df = pd.DataFrame(
|
29 |
|
30 |
-
# Save
|
31 |
-
parquet_file = '
|
32 |
df.to_parquet(parquet_file, engine='pyarrow', index=False)
|
33 |
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
# Gradio Interface
|
37 |
iface = gr.Interface(
|
38 |
-
fn=
|
39 |
-
inputs=
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
)
|
44 |
|
45 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
+
import fitz # PyMuPDF
|
4 |
import os
|
5 |
+
from huggingface_hub import HfApi, HfHubHTTPError
|
6 |
|
7 |
+
def extract_paragraphs_with_headers(pdf_path):
    """Extract text blocks from a PDF, flagging likely headers by font size.

    Args:
        pdf_path: Filesystem path to the PDF file to parse.

    Returns:
        A list of dicts, one per non-empty text block, each with keys:
        ``page_num`` (1-based page number), ``text`` (the block's text),
        and ``is_header`` (True if any span in the block has a font size
        greater than 15 pt — a heuristic; tune per document style).
    """
    data = []
    # Context manager guarantees the document handle is released even if
    # parsing raises mid-way (the original left the document open).
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Image/drawing blocks have no "lines" key — skip them.
                if "lines" not in block:
                    continue

                # Flatten the block's spans once; reused for text and header check.
                spans = [span for line in block["lines"] for span in line["spans"]]
                text = " ".join(span["text"] for span in spans).strip()

                # Skip blocks that strip to nothing (the original appended
                # empty-text rows, polluting the output).
                if not text:
                    continue

                # Detect headers based on font size.
                is_header = any(span["size"] > 15 for span in spans)

                data.append({
                    "page_num": page_num,
                    "text": text,
                    "is_header": is_header,
                })

    return data
|
32 |
+
|
33 |
+
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
    """Convert uploaded PDFs to one Parquet file and optionally upload it.

    Args:
        pdf_files: Gradio file objects; each exposes a ``.name`` path on disk.
        hf_token: Hugging Face API token. Used only for this request;
            never persisted.
        dataset_repo_id: Target dataset repo ID, e.g. ``username/research-dataset``.
        action_choice: One of "Download Locally", "Upload to Hugging Face",
            or "Both". Upload is attempted only for the latter two.

    Returns:
        Tuple of (path to the generated Parquet file, status message string).
        The status is empty when no upload was requested.
    """
    all_data = []

    for pdf_file in pdf_files:
        extracted_data = extract_paragraphs_with_headers(pdf_file.name)

        for item in extracted_data:
            all_data.append({
                'filename': os.path.basename(pdf_file.name),
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header'],
            })

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

    # Save as Parquet
    parquet_file = 'papers_with_headers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    upload_message = ""

    # Only upload if the user selects it
    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            # BUG FIX: HfApi.set_access_token() was removed from
            # huggingface_hub — pass the token to the client constructor
            # instead, so repo_info/upload_file authenticate correctly.
            api = HfApi(token=hf_token)

            # Validate the user's repo exists before transferring anything.
            try:
                api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
                repo_exists = True
            except HfHubHTTPError:
                repo_exists = False

            if repo_exists:
                api.upload_file(
                    path_or_fileobj=parquet_file,
                    path_in_repo='papers_with_headers.parquet',
                    repo_id=dataset_repo_id,
                    repo_type='dataset',
                )
                upload_message = f"✅ Successfully uploaded to {dataset_repo_id}"
            else:
                upload_message = "❌ Dataset repo not found. Please check the repo ID."

        except Exception as e:
            # Surface the failure to the UI rather than crashing the app.
            upload_message = f"❌ Upload failed: {str(e)}"

    # Return the file for local download + upload status
    return parquet_file, upload_message
|
85 |
|
86 |
# Gradio Interface: wire the converter function to drag-and-drop PDF input,
# token/repo fields, and an action selector; outputs are the Parquet file
# plus a status line.
input_components = [
    gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)"),
    gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token"),
    gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset"),
    gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally"),
]

output_components = [
    gr.File(label="Download Parquet File"),
    gr.Textbox(label="Status"),
]

iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=input_components,
    outputs=output_components,
    title="PDF to Parquet Converter with User-Controlled Upload",
    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo.",
)

iface.launch()
|