Jobey1 commited on
Commit
c7a5739
·
verified ·
1 Parent(s): b270230

Update app.py

Browse files

Users can drag and drop PDFs, or search for PDFs they want to convert to Parquet format. Users can download the converted file to their local machine, or can choose to upload it to a Hugging Face dataset using the API key for their repo. No API keys are collected or saved. The code checks that the repo exists before it transfers the files.

Files changed (1) hide show
  1. app.py +84 -26
app.py CHANGED
@@ -1,45 +1,103 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from pdfminer.high_level import extract_text
4
  import os
 
5
 
6
- def extract_paragraphs(text):
7
- # Split text into paragraphs based on double line breaks
8
- paragraphs = [para.strip() for para in text.split('\n\n') if para.strip()]
9
- return paragraphs
10
-
11
- def pdf_to_parquet(pdf_files):
12
  data = []
13
 
14
- for pdf_file in pdf_files:
15
- # Extract text from PDF
16
- text = extract_text(pdf_file.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Extract paragraphs
19
- paragraphs = extract_paragraphs(text)
20
 
21
- # Append to data list
22
- data.append({
23
- 'filename': os.path.basename(pdf_file.name),
24
- 'paragraphs': paragraphs
25
- })
 
 
26
 
27
  # Convert to DataFrame
28
- df = pd.DataFrame(data)
29
 
30
- # Save to Parquet
31
- parquet_file = 'converted_papers.parquet'
32
  df.to_parquet(parquet_file, engine='pyarrow', index=False)
33
 
34
- return parquet_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # Gradio Interface
37
  iface = gr.Interface(
38
- fn=pdf_to_parquet,
39
- inputs=gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs"),
40
- outputs=gr.File(label="Download Parquet File"),
41
- title="PDF to Parquet Converter with Paragraphs",
42
- description="Upload your PDFs, and download a Parquet file with paragraphs preserved for LLM training."
 
 
 
 
 
 
 
 
43
  )
44
 
45
  iface.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import fitz # PyMuPDF
4
  import os
5
+ from huggingface_hub import HfApi, HfHubHTTPError
6
 
7
def extract_paragraphs_with_headers(pdf_path, header_font_size=15):
    """Extract text blocks from a PDF, flagging probable section headers.

    Args:
        pdf_path: Path to a PDF file on disk.
        header_font_size: Font size (points) above which a block is
            treated as a header. Defaults to 15, the original threshold.

    Returns:
        A list of dicts with keys 'page_num' (1-based), 'text', and
        'is_header'.
    """
    data = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Image blocks have no "lines" key; skip them.
                if "lines" not in block:
                    continue
                spans = [span for line in block["lines"] for span in line["spans"]]
                text = " ".join(span["text"] for span in spans).strip()
                if not text:
                    # Skip whitespace-only blocks so the Parquet file
                    # contains no blank rows.
                    continue
                # Heuristic: any span larger than the threshold marks a header.
                is_header = any(span["size"] > header_font_size for span in spans)
                data.append({
                    "page_num": page_num,
                    "text": text,
                    "is_header": is_header,
                })
    finally:
        # fitz documents hold OS file handles; close explicitly
        # instead of leaking one per uploaded PDF.
        doc.close()
    return data
32
+
33
def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice):
    """Convert uploaded PDFs to one Parquet file and optionally upload it.

    Args:
        pdf_files: Gradio file objects; each exposes a ``.name`` disk path.
        hf_token: Hugging Face API token (used for this request only,
            never stored).
        dataset_repo_id: Target dataset repo, e.g. "username/research-dataset".
        action_choice: One of "Download Locally", "Upload to Hugging Face",
            or "Both".

    Returns:
        Tuple of (path to the Parquet file, upload status message).
    """
    all_data = []
    for pdf_file in pdf_files:
        filename = os.path.basename(pdf_file.name)
        for item in extract_paragraphs_with_headers(pdf_file.name):
            all_data.append({
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header']
            })

    # Convert to DataFrame and save as Parquet
    df = pd.DataFrame(all_data)
    parquet_file = 'papers_with_headers.parquet'
    df.to_parquet(parquet_file, engine='pyarrow', index=False)

    upload_message = ""

    # Only upload if the user selects it
    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            # Pass the token to the client directly: HfApi.set_access_token()
            # was removed from huggingface_hub and raises AttributeError on
            # current versions, which made every upload fail.
            api = HfApi(token=hf_token)

            # Validate the user's repo before transferring anything
            try:
                api.repo_info(repo_id=dataset_repo_id, repo_type="dataset")
                repo_exists = True
            except HfHubHTTPError:
                repo_exists = False

            if repo_exists:
                api.upload_file(
                    path_or_fileobj=parquet_file,
                    path_in_repo='papers_with_headers.parquet',
                    repo_id=dataset_repo_id,
                    repo_type='dataset'
                )
                upload_message = f"✅ Successfully uploaded to {dataset_repo_id}"
            else:
                upload_message = "❌ Dataset repo not found. Please check the repo ID."

        except Exception as e:
            upload_message = f"❌ Upload failed: {str(e)}"

    # Return the file for local download + upload status
    return parquet_file, upload_message
85
 
86
# Gradio Interface: PDFs in, a Parquet file plus a status message out.
# The token field is password-masked and only forwarded per request.
pdf_input = gr.File(
    file_types=[".pdf"],
    file_count="multiple",
    label="Upload PDFs (Drag & Drop or Search)",
)
token_input = gr.Textbox(
    label="Hugging Face API Token",
    type="password",
    placeholder="Enter your Hugging Face API token",
)
repo_input = gr.Textbox(
    label="Your Dataset Repo ID (e.g., username/research-dataset)",
    placeholder="username/research-dataset",
)
action_input = gr.Radio(
    ["Download Locally", "Upload to Hugging Face", "Both"],
    label="Action",
    value="Download Locally",
)

iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=[pdf_input, token_input, repo_input, action_input],
    outputs=[
        gr.File(label="Download Parquet File"),
        gr.Textbox(label="Status"),
    ],
    title="PDF to Parquet Converter with User-Controlled Upload",
    description="Upload your PDFs (drag & drop or search), convert them to Parquet, and upload to your own Hugging Face Dataset repo.",
)

iface.launch()