Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Files Community

Jobey1 committed on Feb 26

Commit

69c287e

verified ·

1 Parent(s): 09053ce

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -95

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from huggingface_hub.utils import HfHubHTTPError
 import time
 def extract_full_paper_with_labels(pdf_path, progress=None):
-    print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
     content = ""
@@ -28,7 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
     # Regex patterns for detection
     doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
     year_pattern = r'\b(19|20)\d{2}\b'
-    code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
     reference_keywords = ['reference', 'bibliography', 'sources']
     financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
@@ -122,99 +122,39 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
         "content": content
     }
-def upload_with_progress(file_path, repo_id, token, progress):  # uploads file_path to the dataset repo repo_id and returns a status string
-    """
-    Upload file to Hugging Face Dataset using upload_file() API method.
-    """
-    print(f"📤 Starting upload of Parquet: {file_path}")
-    file_size = os.path.getsize(file_path)  # NOTE(review): file_size is never read again in this function
-    api = HfApi()
-    try:
-        # Use upload_file() method from huggingface_hub
-        api.upload_file(
-            path_or_fileobj=file_path,
-            path_in_repo=os.path.basename(file_path),  # stored in the repo under the local file's base name
-            repo_id=repo_id,
-            repo_type="dataset",
-            token=token
-        )
-        if progress is not None:
-            progress(1, desc="✅ Upload Complete")  # progress is reported only once, after upload_file() has returned
-        print(f"✅ Successfully uploaded to {repo_id}")
-        return f"✅ Successfully uploaded to {repo_id}"
-    except HfHubHTTPError as e:  # Hub HTTP failures are turned into a status string rather than re-raised
-        print(f"❌ Upload failed: {e}")
-        return f"❌ Upload failed: {str(e)}"
-    except Exception as e:  # any other exception is likewise reported as a string to the caller
-        print(f"❌ Unexpected error: {e}")
-        return f"❌ Unexpected error: {str(e)}"
-def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, state, progress=gr.Progress()):  # extracts each PDF, writes one Parquet file, optionally uploads it; returns (None, status message, state)
-    all_data = []
-    total_files = len(pdf_files)
-    print("🚀 Starting PDF to Parquet Conversion Process")
-    for idx, pdf_file in enumerate(pdf_files):
-        if progress:
-            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")  # fraction of files started so far
-        extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)  # presumably .name is the uploaded file's path on disk; verify against the Gradio version in use
-        all_data.append(extracted_data)
-    print("🟡 Converting Processed Data to Parquet")
-    df = pd.DataFrame(all_data)  # one row per processed PDF
-    parquet_file = 'fully_labeled_papers.parquet'  # fixed relative file name, written to the current working directory
-    try:
-        df.to_parquet(parquet_file, engine='pyarrow', index=False)
-        print("✅ Parquet Conversion Completed")
-    except Exception as e:
-        print(f"❌ Parquet Conversion Failed: {str(e)}")
-        return None, f"❌ Parquet Conversion Failed: {str(e)}", state  # conversion failure ends the call early with the error as the status
-    upload_message = "Skipped Upload"  # status reported when action_choice does not request an upload
-    # ✅ Upload Parquet if selected
-    if action_choice in ["Upload to Hugging Face", "Both"]:
-        try:
-            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
-        except Exception as e:
-            print(f"❌ Upload Failed: {str(e)}")
-            upload_message = f"❌ Upload failed: {str(e)}"
-    print("🏁 Process Completed")
-    # ✅ Clear Uploaded PDFs and Parquet File
-    if os.path.exists(parquet_file):
-        os.remove(parquet_file)  # NOTE(review): only the Parquet file is removed here; the uploaded PDFs are not touched
-        print("🗑️ Parquet file cleared after processing.")
-    return None, upload_message, state  # NOTE(review): the first slot is always None and the file is already deleted, so no Parquet file is ever returned to the caller
-# ✅ Gradio Interface
-iface = gr.Interface(
-    fn=pdf_to_parquet_and_upload,  # NOTE(review): fn also takes a `state` parameter and returns three values, but only four inputs and two outputs are declared below — confirm
-    inputs=[
-        gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)"),
-        gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token"),  # token field is masked
-        gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset"),
-        gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally")  # these strings are compared against action_choice inside fn
-    ],
-    outputs=[
-        gr.File(label="Download Parquet File"),
-        gr.Textbox(label="Status")
-    ],
-    title="PDF to Parquet Converter with Full Labeling",
-    description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
-)
-iface.launch()  # launched at module import time; there is no __main__ guard

 import time
 def extract_full_paper_with_labels(pdf_path, progress=None):
+    print(f"📝 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
     content = ""
     # Regex patterns for detection
     doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
     year_pattern = r'\b(19|20)\d{2}\b'
+    code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|}|;)"
     reference_keywords = ['reference', 'bibliography', 'sources']
     financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
         "content": content
     }
+# NEW: Function to clear file-related inputs/outputs only.
+def clear_files():  # takes no inputs; returns one reset value per component wired to the "Clear Files" button
+    # Return empty values for file input and output display.
+    # Notice that we do NOT return anything for the API key or repo address.
+    return None, "", None  # order matches outputs=[pdf_file_input, output_display, parquet_output]
+# Gradio interface setup
+with gr.Blocks() as demo:
+    with gr.Row():
+        api_key_input = gr.Textbox(label="API Key", placeholder="Enter API Key")  # NOTE(review): no type="password", so the key is shown in clear text; not passed to any handler in the visible code
+        repo_address_input = gr.Textbox(label="Repo Address", placeholder="Enter Repo Address")  # NOTE(review): not passed to any handler in the visible code
+    with gr.Row():
+        pdf_file_input = gr.File(label="Upload PDF")  # no file_count or file_types given here
+        convert_button = gr.Button("Convert to Parquet")
+        clear_button = gr.Button("Clear Files")
+    output_display = gr.Textbox(label="Output")
+    # (Optional) A hidden textbox for parquet data, if used later.
+    parquet_output = gr.Textbox(label="Parquet Data", visible=False)  # in the visible code this is only ever written by clear_files
+    convert_button.click(
+        extract_full_paper_with_labels,  # NOTE(review): the extractor is wired directly; its result (presumably a dict, given the "content" key) goes to a Textbox and no Parquet step is visible — confirm intended
+        inputs=pdf_file_input,  # the uploaded file value is passed as pdf_path; progress keeps its default of None
+        outputs=output_display
+    )
+    # The clear button now only clears file-related components;
+    # API key and Repo Address remain untouched.
+    clear_button.click(
+        clear_files,
+        inputs=None,
+        outputs=[pdf_file_input, output_display, parquet_output]  # must stay in the same order as the tuple returned by clear_files
+    )
+demo.launch()  # launched at module import time; there is no __main__ guard