# {title}

import gradio as gr
import pandas as pd
import fitz  # PyMuPDF
import os
import re
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
import time
import hashlib
import requests

# Detection patterns, compiled once at import time rather than per text block.
# DOI suffixes are case-insensitive, so the match must not be limited to A-Z.
_DOI_RE = re.compile(r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b", re.IGNORECASE)
_YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")
# Whole words only, so "nearby" or "authority" do not count as an author line.
_AUTHOR_RE = re.compile(r"\b(?:authors?|by)\b", re.IGNORECASE)
_TABLE_RE = re.compile(r"table\s*\d+", re.IGNORECASE)
_FIGURE_RE = re.compile(r"figure\s*\d+", re.IGNORECASE)
_EQUATION_RE = re.compile(r"=|∑|√|±|×|π|μ|σ")
_CODE_RE = re.compile(r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)")
_REFERENCE_KEYWORDS = ('reference', 'bibliography', 'sources')
_FINANCIAL_KEYWORDS = ('p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown')


def extract_full_paper_with_labels(pdf_path, progress=None):
    """Extract a PDF's text blocks and wrap each one in a heuristic label tag.

    Every text block is assigned exactly one label; the checks run in this
    order and the first match wins:

    1. ``<TITLE>``: first block on the first page with a font size above 15.
    2. ``<AUTHORS>``: first block containing the word "author(s)" or "by".
    3. ``<YEAR>``: first 19xx/20xx year found (only the year is kept).
    4. ``<DOI>``: first DOI found (only the DOI is kept).
    5. ``<ABSTRACT>``: first block containing "abstract".
    6. Footnote: any block whose largest font size is below 10.
    7. Reference: any block containing a reference keyword.
    8. ``<TABLE>``, ``<FIGURE>``, ``<EQUATION>``, ``<CODE>``,
       ``<FINANCIAL_METRIC>``: keyword / regex matches.
    9. ``<PARAGRAPH>``: everything else.

    Footnote and reference blocks are collected and appended at the end as a
    single ``<FOOTNOTE>`` and a single ``<REFERENCE>`` element. Blocks without
    any text are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        progress: Optional callable invoked once per page as
            ``progress(fraction, desc=...)``; ``None`` disables reporting.

    Returns:
        A dict with the keys ``"filename"`` (base name of ``pdf_path``),
        ``"title"`` (detected title, or ``"Untitled_Paper"`` when none was
        found) and ``"content"`` (the labelled text, one element per line).

    Raises:
        Whatever ``fitz.open`` or page text extraction raises for an
        unreadable file; nothing is caught here.
    """
    filename = os.path.basename(pdf_path)
    print(f"📄 Starting PDF Processing: {filename}")

    labelled = []    # finished "<TAG>...</TAG>\n" fragments, in document order
    footnotes = []   # small-font blocks, emitted together at the end
    references = []  # reference-like blocks, emitted together at the end
    title = ""
    seen_authors = seen_year = seen_doi = seen_abstract = False

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        for page_num, page in enumerate(doc):
            if progress is not None:
                progress((page_num + 1) / total_pages, desc=f"Processing Page {page_num + 1}/{total_pages}")

            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue  # no text lines in this block

                spans = [span for line in block["lines"] for span in line["spans"]]
                text = " ".join(span["text"] for span in spans).strip()
                if not text:
                    # An empty block would otherwise yield an empty tag (or an
                    # empty title that lets a second title through).
                    continue
                max_font_size = max((span["size"] for span in spans), default=0)
                lowered = text.lower()

                # Only search for one-off metadata that has not been found yet.
                year_match = None if seen_year else _YEAR_RE.search(text)
                doi_match = None if seen_doi else _DOI_RE.search(text)

                if page_num == 0 and max_font_size > 15 and not title:
                    title = text
                    labelled.append(f"<TITLE>{title}</TITLE>\n")
                elif not seen_authors and _AUTHOR_RE.search(text):
                    seen_authors = True
                    labelled.append(f"<AUTHORS>{text}</AUTHORS>\n")
                elif year_match:
                    seen_year = True
                    labelled.append(f"<YEAR>{year_match.group(0)}</YEAR>\n")
                elif doi_match:
                    seen_doi = True
                    labelled.append(f"<DOI>{doi_match.group(0)}</DOI>\n")
                elif not seen_abstract and "abstract" in lowered:
                    seen_abstract = True
                    labelled.append(f"<ABSTRACT>{text}</ABSTRACT>\n")
                elif max_font_size < 10:
                    footnotes.append(text)
                elif any(keyword in lowered for keyword in _REFERENCE_KEYWORDS):
                    references.append(text)
                elif _TABLE_RE.search(text):
                    labelled.append(f"<TABLE>{text}</TABLE>\n")
                elif _FIGURE_RE.search(text):
                    labelled.append(f"<FIGURE>{text}</FIGURE>\n")
                elif _EQUATION_RE.search(text):
                    labelled.append(f"<EQUATION>{text}</EQUATION>\n")
                elif _CODE_RE.search(text) and len(text.split()) <= 50:
                    # Long blocks are treated as prose even if they contain
                    # code-like tokens.
                    labelled.append(f"<CODE>{text}</CODE>\n")
                elif any(fin_kw in lowered for fin_kw in _FINANCIAL_KEYWORDS):
                    labelled.append(f"<FINANCIAL_METRIC>{text}</FINANCIAL_METRIC>\n")
                else:
                    labelled.append(f"<PARAGRAPH>{text}</PARAGRAPH>\n")
    finally:
        # Release the file handle even when extraction fails part-way.
        doc.close()

    if footnotes:
        labelled.append(f"<FOOTNOTE>{' '.join(footnotes)}</FOOTNOTE>\n")
    if references:
        labelled.append(f"<REFERENCE>{' '.join(references)}</REFERENCE>\n")

    print(f"✅ Finished Processing PDF: {filename}")
    return {
        "filename": filename,
        "title": title if title else "Untitled_Paper",
        "content": "".join(labelled)
    }


def upload_with_progress(file_path, repo_id, token, progress):
    """
    Upload a file to a Hugging Face dataset repository and report progress.

    The file is stored at the root of the dataset repository under its own
    base name. The transfer is delegated to ``HfApi.upload_file`` so the whole
    file is committed in one operation; sending separate chunks to one URL
    cannot assemble a file on the Hub.

    Because the Hub client performs the transfer in a single call, progress is
    reported only at the start and on completion.

    Args:
        file_path: Local path of the file to upload.
        repo_id: Target dataset repository, e.g. ``"username/research-dataset"``.
        token: Hugging Face access token. An empty value is passed on as
            ``None`` so the client can fall back to its own stored credentials.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``;
            ``None`` disables reporting.

    Returns:
        A human-readable status string: a success message, or a message
        starting with "❌" that describes the failure. Errors are reported
        through the return value rather than raised.
    """
    print(f"📤 Starting upload of Parquet: {file_path}")
    api = HfApi()

    try:
        size_mb = os.path.getsize(file_path) // (1024 * 1024)
        if progress is not None:
            progress(0, desc=f"Uploading... 0MB/{size_mb}MB")

        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
            token=token or None,
        )

        # Final progress update
        if progress is not None:
            progress(1, desc="✅ Upload Complete")

        print(f"✅ Successfully uploaded to {repo_id}")
        return f"✅ Successfully uploaded to {repo_id}"

    except HfHubHTTPError as e:
        print(f"❌ Upload failed: {e}")
        return f"❌ Upload failed: {str(e)}"
    except Exception as e:
        # Boundary handler: the caller shows this text in the UI status box.
        print(f"❌ Unexpected error: {e}")
        return f"❌ Unexpected error: {str(e)}"


def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choice, progress=gr.Progress()):
    """Convert uploaded PDFs to Parquet files and optionally upload them.

    Each PDF is run through ``extract_full_paper_with_labels``; the resulting
    record is written as a one-row Parquet file in the current working
    directory, named after the sanitized paper title. When ``action_choice``
    is "Upload to Hugging Face" or "Both", the file is then uploaded with
    ``upload_with_progress``.

    Args:
        pdf_files: Uploaded PDFs. Each item may be a path string or an object
            exposing the path as ``.name``.
        hf_token: Hugging Face access token used for uploads.
        dataset_repo_id: Target dataset repository id for uploads.
        action_choice: One of "Download Locally", "Upload to Hugging Face"
            or "Both".
        progress: Progress callback invoked as ``progress(fraction, desc=...)``.

    Returns:
        A ``(parquet_path, status)`` tuple. ``parquet_path`` is the Parquet
        file written for the last PDF, or ``None`` when nothing was written.
        ``status`` holds one upload status line per uploaded file (empty when
        no upload was requested), or an error/warning message.
    """

    # ✅ Helper function inside this block to avoid external edits
    def sanitize_filename(title, max_length=100):
        """
        Sanitize and truncate the filename to avoid OS limits.

        Characters that are invalid in file names are removed and spaces
        become underscores. Names longer than ``max_length`` are cut to
        ``max_length`` characters and get an 8-character hash suffix so that
        different long titles stay distinct.
        """
        sanitized = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")

        # A title made only of invalid characters would give ".parquet".
        if not sanitized:
            return "Untitled_Paper"

        if len(sanitized) > max_length:
            hash_suffix = hashlib.md5(sanitized.encode()).hexdigest()[:8]
            sanitized = sanitized[:max_length] + "_" + hash_suffix

        return sanitized

    # Nothing selected: report it instead of failing on len(None) or on an
    # unassigned result variable after an empty loop.
    if not pdf_files:
        return None, "⚠️ No PDF files provided."

    total_files = len(pdf_files)
    wants_upload = action_choice in ["Upload to Hugging Face", "Both"]
    upload_messages = []
    parquet_file = None
    print("🚀 Starting PDF to Parquet Conversion Process")

    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")

        # Accept both plain path strings and file objects carrying `.name`.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

        # ✅ Step 1: Process PDF with Full Labels
        extracted_data = extract_full_paper_with_labels(pdf_path, progress=progress)

        # ✅ Step 2: Use Title for Parquet Filename with Truncation & Hash
        sanitized_title = sanitize_filename(extracted_data["title"])
        parquet_file = f"{sanitized_title}.parquet"

        try:
            pd.DataFrame([extracted_data]).to_parquet(parquet_file, engine='pyarrow', index=False)
            print(f"✅ Parquet saved as: {parquet_file}")
        except Exception as e:
            print(f"❌ Parquet Conversion Failed: {str(e)}")
            return None, f"❌ Parquet Conversion Failed: {str(e)}"

        # ✅ Step 3: Upload Parquet (if selected)
        if wants_upload:
            try:
                upload_messages.append(
                    upload_with_progress(parquet_file, dataset_repo_id, hf_token, progress)
                )
            except Exception as e:
                print(f"❌ Upload Failed: {str(e)}")
                upload_messages.append(f"❌ Upload failed: {str(e)}")

    print("🏁 Process Completed")
    # One status line per uploaded file, so earlier results are not lost.
    return parquet_file, "\n".join(upload_messages)

# ✅ Gradio Interface
# Input components, listed in the positional order of the callback's
# parameters: PDF files, API token, dataset repo id, action choice.
_interface_inputs = [
    gr.File(
        file_types=[".pdf"],
        file_count="multiple",
        label="Upload PDFs (Drag & Drop or Search)",
    ),
    gr.Textbox(
        label="Hugging Face API Token",
        type="password",
        placeholder="Enter your Hugging Face API token",
    ),
    gr.Textbox(
        label="Your Dataset Repo ID (e.g., username/research-dataset)",
        placeholder="username/research-dataset",
    ),
    gr.Radio(
        ["Download Locally", "Upload to Hugging Face", "Both"],
        label="Action",
        value="Download Locally",
    ),
]

# Output components, matching the callback's (parquet file, status) result.
_interface_outputs = [
    gr.File(label="Download Parquet File"),
    gr.Textbox(label="Status"),
]

iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=_interface_inputs,
    outputs=_interface_outputs,
    title="PDF to Parquet Converter with Title-Based Naming",
    description="Upload your PDFs, convert them to Parquet files named after the paper title, and upload to your Hugging Face Dataset.",
)

# Start the web UI as soon as the module is executed.
iface.launch()