import gradio as gr
import pandas as pd
import fitz  # PyMuPDF
import os
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
import requests
import time


def extract_paragraphs_with_headers(pdf_path, progress=None):
    """Extract text blocks from a PDF, flagging likely section headers.

    Args:
        pdf_path: Filesystem path to the PDF to process.
        progress: Optional Gradio-style progress callback accepting
            ``(fraction, desc=...)``; ``None`` disables reporting.

    Returns:
        A list of dicts, one per text block, with keys:
        ``page_num`` (1-based page number), ``text`` (whitespace-joined
        span text) and ``is_header`` (True when any span in the block
        uses a font size above 15pt — a simple header heuristic).
    """
    print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
    doc = fitz.open(pdf_path)
    data = []
    total_pages = len(doc)

    # NOTE: enumerate(doc) yields at most total_pages items, so the old
    # manual "iteration limit" guard was unreachable and has been removed.
    for page_num, page in enumerate(doc):
        if progress is not None:
            progress(
                (page_num + 1) / total_pages,
                desc=f"Processing Page {page_num + 1}/{total_pages}",
            )

        for block in page.get_text("dict")["blocks"]:
            # Blocks without "lines" are images/drawings — skip them.
            if "lines" not in block:
                continue

            # Flatten every span of the block into one paragraph string.
            text = " ".join(
                span["text"]
                for line in block["lines"]
                for span in line["spans"]
            ).strip()

            # Header heuristic: any span rendered larger than 15pt.
            is_header = any(
                span["size"] > 15
                for line in block["lines"]
                for span in line["spans"]
            )

            data.append({
                "page_num": page_num + 1,
                "text": text,
                "is_header": is_header,
            })

    print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
    return data


def upload_with_progress(file_path, repo_id, token, progress):
    """Upload a file to a Hugging Face dataset repository.

    The previous implementation PUT raw 1 MB chunks to an unofficial
    ``/upload`` endpoint; each successive PUT would overwrite the prior
    chunk and only the final response was checked (and ``response`` was
    unbound for an empty file). This version uses the supported
    ``HfApi.upload_file`` API instead.

    Args:
        file_path: Local path of the file to upload.
        repo_id: Target dataset repo, e.g. ``"username/research-dataset"``.
        token: Hugging Face API token with write access.
        progress: Optional Gradio-style progress callback.

    Returns:
        A human-readable success message.

    Raises:
        Exception: Wrapping the underlying ``HfHubHTTPError`` on failure.
    """
    print(f"📤 Starting upload of Parquet: {file_path}")
    if progress is not None:
        progress(0.0, desc="Uploading to Hugging Face...")

    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=os.path.basename(file_path),
            repo_id=repo_id,
            repo_type="dataset",
        )
    except HfHubHTTPError as e:
        raise Exception(f"❌ Upload failed: {e}") from e

    if progress is not None:
        progress(1.0, desc="Upload complete")

    print(f"✅ Successfully uploaded to {repo_id}")
    return f"✅ Successfully uploaded to {repo_id}"


def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id,
                              action_choice, progress=gr.Progress()):
    """Convert uploaded PDFs to a single Parquet file; optionally upload it.

    Args:
        pdf_files: List of Gradio file objects (each with a ``.name`` path).
        hf_token: Hugging Face API token (used only when uploading).
        dataset_repo_id: Target dataset repo id, e.g. ``"user/dataset"``.
        action_choice: One of ``"Download Locally"``,
            ``"Upload to Hugging Face"`` or ``"Both"``.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        Tuple of (parquet file path or ``None`` on failure, status message).
    """
    all_data = []
    total_files = len(pdf_files)
    print("🚀 Starting PDF to Parquet Conversion Process")

    for idx, pdf_file in enumerate(pdf_files):
        if progress is not None:
            progress(idx / total_files,
                     desc=f"Processing File {idx + 1}/{total_files}")

        # Step 1: extract text blocks from this PDF.
        extracted_data = extract_paragraphs_with_headers(
            pdf_file.name, progress=progress
        )
        filename = os.path.basename(pdf_file.name)
        for item in extracted_data:
            all_data.append({
                'filename': filename,
                'page_num': item['page_num'],
                'text': item['text'],
                'is_header': item['is_header'],
            })

    print("🟡 Converting Processed Data to Parquet")

    # Step 2: convert the accumulated rows to Parquet.
    df = pd.DataFrame(all_data)
    parquet_file = 'papers_with_headers.parquet'
    try:
        df.to_parquet(parquet_file, engine='pyarrow', index=False)
        print("✅ Parquet Conversion Completed")
    except Exception as e:
        print(f"❌ Parquet Conversion Failed: {str(e)}")
        return None, f"❌ Parquet Conversion Failed: {str(e)}"

    upload_message = "Skipped Upload"

    # Step 3: upload the Parquet file if the user asked for it.
    if action_choice in ["Upload to Hugging Face", "Both"]:
        try:
            upload_message = upload_with_progress(
                parquet_file, dataset_repo_id, hf_token, progress
            )
        except Exception as e:
            print(f"❌ Upload Failed: {str(e)}")
            upload_message = f"❌ Upload failed: {str(e)}"

    print("🏁 Process Completed")
    return parquet_file, upload_message


# Gradio interface wiring the conversion pipeline to a simple web UI.
iface = gr.Interface(
    fn=pdf_to_parquet_and_upload,
    inputs=[
        gr.File(file_types=[".pdf"], file_count="multiple",
                label="Upload PDFs (Drag & Drop or Search)"),
        gr.Textbox(label="Hugging Face API Token", type="password",
                   placeholder="Enter your Hugging Face API token"),
        gr.Textbox(label="Your Dataset Repo ID (e.g., username/research-dataset)",
                   placeholder="username/research-dataset"),
        gr.Radio(["Download Locally", "Upload to Hugging Face", "Both"],
                 label="Action", value="Download Locally"),
    ],
    outputs=[
        gr.File(label="Download Parquet File"),
        gr.Textbox(label="Status"),
    ],
    title="PDF to Parquet Converter with Detailed Progress",
    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset with clear progress indicators."
)

if __name__ == "__main__":
    iface.launch()