import os
import re
import time

import gradio as gr
import pandas as pd
import fitz  # PyMuPDF
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError


def sanitize_title(title, max_length=100):
    """
    Sanitize a paper title so it is safe to use as a filename.
    Removes non-alphanumeric characters (except underscores and hyphens)
    and truncates the result to max_length characters.
    """
    sanitized = re.sub(r'[^\w\s-]', '', title).strip()  # Drop unwanted characters
    sanitized = re.sub(r'[-\s]+', '_', sanitized)       # Replace spaces and hyphens with underscores
    return sanitized[:max_length]


def extract_full_paper_with_labels(pdf_path, progress=None):
    print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
    doc = fitz.open(pdf_path)
    content = ""

    # Metadata fields
    title = ""
    authors = ""
    year = ""
    doi = ""
    abstract = ""
    footnotes = ""
    references = ""

    total_pages = len(doc)
    max_iterations = total_pages * 2  # Guard against malformed PDFs that loop
    iteration_count = 0

    # Regex patterns for detection
    doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
    year_pattern = r'\b(19|20)\d{2}\b'
    code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|\{|\}|;)"
    reference_keywords = ['reference', 'bibliography', 'sources']
    financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']

    for page_num, page in enumerate(doc):
        iteration_count += 1
        if iteration_count > max_iterations:
            raise Exception("⚠️ PDF processing exceeded iteration limit. Possible malformed PDF.")

        if progress is not None:
            progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")

        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" not in block:
                continue

            # Collect the block's text and track its largest font size
            text = ""
            max_font_size = 0
            for line in block["lines"]:
                for span in line["spans"]:
                    text += span["text"] + " "
                    if span["size"] > max_font_size:
                        max_font_size = span["size"]
            text = text.strip()

            # Title (first page, largest font)
            if page_num == 0 and max_font_size > 15 and not title:
                title = text
                content += f"<TITLE>{title}</TITLE>\n"
            # Authors
            elif re.search(r'author|by', text, re.IGNORECASE) and not authors:
                authors = text
                content += f"<AUTHORS>{authors}</AUTHORS>\n"
            # Year
            elif re.search(year_pattern, text) and not year:
                year = re.search(year_pattern, text).group(0)
                content += f"<YEAR>{year}</YEAR>\n"
            # DOI
            elif re.search(doi_pattern, text) and not doi:
                doi = re.search(doi_pattern, text).group(0)
                content += f"<DOI>{doi}</DOI>\n"
            # Abstract
            elif "abstract" in text.lower() and not abstract:
                abstract = text
                content += f"<ABSTRACT>{abstract}</ABSTRACT>\n"
            # Footnotes (small fonts)
            elif max_font_size < 10:
                footnotes += text + " "
            # References
            elif any(keyword in text.lower() for keyword in reference_keywords):
                references += text + " "
            # Tables
            elif re.search(r"table\s*\d+", text, re.IGNORECASE):
                content += f"<TABLE>{text}</TABLE>\n"
            # Figures
            elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
                content += f"<FIGURE>{text}</FIGURE>\n"
            # Equations (look for math symbols)
            elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
                content += f"<EQUATION>{text}</EQUATION>\n"
            # Short code blocks (keyword heuristics, capped at 50 tokens)
            elif re.search(code_pattern, text) and len(text.split()) <= 50:
                content += f"<CODE>{text}</CODE>\n"
            # Financial metrics
            elif any(fin_kw in text.lower() for fin_kw in financial_keywords):
                content += f"<FINANCIAL_METRIC>{text}</FINANCIAL_METRIC>\n"
            # Regular paragraph
            else:
                content += f"<PARAGRAPH>{text}</PARAGRAPH>\n"

    # Append the footnotes and references collected along the way
    if footnotes:
        content += f"<FOOTNOTES>{footnotes.strip()}</FOOTNOTES>\n"
    if references:
        content += f"<REFERENCES>{references.strip()}</REFERENCES>\n"

    print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
    return {
        "filename": os.path.basename(pdf_path),
        "title": title,  # Used downstream to name the Parquet file
        "content": content
    }


def upload_with_progress(file_path, repo_id, token, progress):
    """
    Upload a file to a Hugging Face dataset repo via the upload_file() API.
    """
    print(f"📤 Starting upload of Parquet: {file_path}")
    api = HfApi()
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
            token=token
        )
        if progress is not None:
            progress(1, desc="✅ Upload Complete")
        print(f"✅ Successfully uploaded to {repo_id}")
        return f"✅ Successfully uploaded to {repo_id}"
    except HfHubHTTPError as e:
        print(f"❌ Upload failed: {e}")
        return f"❌ Upload failed: {str(e)}"
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        return f"❌ Unexpected error: {str(e)}"


def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
    all_data = []
    total_files = len(pdf_files)
    print("🚀 Starting PDF to Parquet Conversion Process")

    # Step 1: process each PDF with full labeling
    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
        extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
        all_data.append(extracted_data)

    # Step 2: convert to Parquet
    print("🟡 Converting Processed Data to Parquet")
    df = pd.DataFrame(all_data)

    # Name the file after the paper title for a single PDF; for batches,
    # add a timestamp to avoid overwrites.
    if len(all_data) == 1:
        paper_title = all_data[0].get("title", "").strip()
        if paper_title:
            parquet_file = f"{sanitize_title(paper_title)}.parquet"
        else:
            parquet_file = "fully_labeled_papers.parquet"
    else:
        parquet_file = f"fully_labeled_papers_{time.strftime('%Y%m%d_%H%M%S')}.parquet"

    try:
        df.to_parquet(parquet_file, engine='pyarrow', index=False)
        print("✅ Parquet Conversion Completed")
    except Exception as e:
        print(f"❌ Parquet Conversion Failed: {str(e)}")
        return None, f"❌ Parquet Conversion Failed: {str(e)}"

    upload_message = "Skipped Upload"

    # Step 3: upload the Parquet file (if selected)
    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            upload_message = upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
        except Exception as e:
            print(f"❌ Upload Failed: {str(e)}")
            upload_message = f"❌ Upload failed: {str(e)}"

    print("🏁 Process Completed")
    return parquet_file, upload_message


def reset_files_fn():
    """Clear the PDF uploads and the generated Parquet file; credentials stay."""
    return None, None
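

# For quick testing outside the Gradio UI, the extraction step can be
# exercised directly. A minimal sketch, kept as a comment so it does not run
# at app startup; "example.pdf" is a placeholder path, not a file shipped
# with this app:
#
#   data = extract_full_paper_with_labels("example.pdf")
#   print(data["title"])
#   print(data["content"][:500])  # preview the labeled text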


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PDF to Parquet Converter with Full Labeling

        **Clear All Inputs:** Resets every field, including your API key and dataset repo ID.

        **Reset Files Only:** Clears the PDF uploads and the generated Parquet file while keeping your credentials intact.
        """
    )
    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs (Drag & Drop or Search)")
    with gr.Row():
        hf_token = gr.Textbox(label="Hugging Face API Token", type="password", placeholder="Enter your Hugging Face API token")
        dataset_repo = gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)", placeholder="username/research-dataset")
    with gr.Row():
        action_radio = gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"], label="Action", value="Download Locally")
    with gr.Row():
        convert_button = gr.Button("Convert PDF to Parquet")
        reset_files_button = gr.Button("Reset Files Only")
        clear_all_button = gr.Button("Clear All Inputs")
    with gr.Row():
        output_file = gr.File(label="Download Parquet File")
        status_text = gr.Textbox(label="Status")

    convert_button.click(
        fn=pdf_to_parquet_and_upload,
        inputs=[pdf_input, hf_token, dataset_repo, action_radio],
        outputs=[output_file, status_text]
    )
    reset_files_button.click(
        fn=reset_files_fn,
        inputs=None,
        outputs=[pdf_input, output_file]
    )

    # Clear All resets every input, including credentials, and restores the
    # default action.
    def clear_all_fn():
        return None, None, None, "Download Locally"

    clear_all_button.click(
        fn=clear_all_fn,
        inputs=None,
        outputs=[pdf_input, hf_token, dataset_repo, action_radio]
    )

demo.launch()
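
# A generated Parquet file can be inspected afterwards with pandas, using the
# same pyarrow engine as above. A small sketch; the filename below is just
# the default used for a single untitled paper:
#
#   df = pd.read_parquet("fully_labeled_papers.parquet")
#   print(df[["filename", "title"]])
#   print(df.loc[0, "content"][:300])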