import gradio as gr
import easyocr
import cv2
import numpy as np
from PIL import Image
import pdf2image
import tempfile
from langchain_groq import ChatGroq
import re
import os

# OCR engine, constructed once at import time with an English-only language list
reader = easyocr.Reader(['en'])

# Chat model client used for summarization; the API key is read from the
# GROQ_API_KEY environment variable (None is passed through if it is unset)
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    groq_api_key=os.environ.get("GROQ_API_KEY"),
    temperature=0,
)

# Utility to clean up unwanted characters
def clean_text(text):
    """Return *text* with bullet/markup symbols removed and whitespace tidied.

    Runs of the listed bullet-like characters (asterisks, dots, arrows,
    tildes, ...) are deleted, any run of two or more whitespace characters
    is collapsed to a single space, and leading/trailing whitespace is
    stripped. A single whitespace character (e.g. one newline) is kept as-is.
    """
    without_symbols = re.sub(r"[*•●▪️✦➡️~]+", "", text)
    single_spaced = re.sub(r"\s{2,}", " ", without_symbols)
    return single_spaced.strip()

def _load_first_page_bgr(file_path):
    """Load the upload at *file_path* as a BGR pixel array.

    For a ``.pdf`` path only the first page is rasterized; any other path is
    read with ``cv2.imread``. Returns ``None`` when no image could be
    obtained (empty PDF conversion result or an undecodable image file).
    """
    if file_path.lower().endswith(".pdf"):
        # Only the first page is used downstream, so do not rasterize the rest.
        pages = pdf2image.convert_from_path(file_path, first_page=1, last_page=1)
        if not pages:
            return None
        rgb_pixels = np.array(pages[0].convert("RGB"))
        # PIL yields RGB while cv2.imread yields BGR; normalize to BGR so both
        # branches hand the OCR reader the same channel order.
        return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    # cv2.imread signals failure by returning None rather than raising.
    return cv2.imread(file_path)


def extract_text_and_summarize(file):
    """OCR an uploaded image/PDF and summarize the recognized text with the LLM.

    Args:
        file: The uploaded file as delivered by the UI: either an object with
            a ``.name`` attribute holding the path, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        A ``(extracted_text, summary)`` tuple of strings. When the upload is
        missing, cannot be decoded, or contains no readable text, the first
        element is a short explanatory message and the second is ``""``.

    Note:
        For PDFs only the first page is processed.
    """
    if file is None:
        return "Please upload an image or PDF first.", ""

    # Accept both file-like uploads (with .name) and plain path strings.
    file_path = getattr(file, "name", file)

    image = _load_first_page_bgr(file_path)
    if image is None:
        return "Could not read the uploaded file.", ""

    # Each OCR result is indexed at [1] to obtain the recognized text.
    results = reader.readtext(image)
    extracted_text = clean_text(' '.join(entry[1] for entry in results))

    if not extracted_text:
        return "No readable text found.", ""

    # LLM summarization
    messages = [
        {"role": "system", "content": "Your job is to summarize the given research paper and list its key sub-domains and topics clearly."},
        {"role": "user", "content": extracted_text}
    ]
    result = llm.invoke(messages)

    summarized_text = clean_text(result.content)
    return extracted_text, summarized_text

# Assemble the Gradio interface: header text, upload widget, two read-only
# output boxes, and a button that runs the OCR + summarization pipeline.
with gr.Blocks(title="Research Paper Summarizer") as iface:
    gr.Markdown(
        """
        # 🧠 Research Paper Summarizer
        Upload an **image** or **PDF** of a research paper. This app will:
        - Extract text using OCR
        - Summarize the content
        - List key subdomains and research topics

        ⚡ Powered by EasyOCR & LLaMA-3 via Groq
        """
    )

    # Input: a single file restricted to the supported image/PDF extensions
    with gr.Row():
        file_input = gr.File(
            label="📄 Upload Research Paper (Image or PDF)",
            file_types=[".png", ".jpg", ".jpeg", ".pdf"],
        )

    # Outputs: raw OCR text on the left, LLM summary on the right
    with gr.Row():
        extracted_box = gr.Textbox(
            label="🔍 Extracted Text",
            lines=10,
            interactive=False,
        )
        summary_box = gr.Textbox(
            label="📌 Summarized Topics & Subdomains",
            lines=10,
            interactive=False,
        )

    run_button = gr.Button("🔁 Run Summarizer")

    # The handler's two return values map onto the two text boxes in order
    run_button.click(
        fn=extract_text_and_summarize,
        inputs=file_input,
        outputs=[extracted_box, summary_box],
    )

# Start the app with Gradio sharing enabled
iface.launch(share=True)