import gradio as gr
from pdfminer.high_level import extract_text
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import os
import markdown2

# Retrieve API keys from Hugging Face Spaces secrets
openai_api_key = os.environ.get('OPENAI_API_KEY')
groq_api_key = os.environ.get('GROQ_API_KEY')
google_api_key = os.environ.get('GEMINI_API_KEY')

# Initialize API clients with the API keys
openai_client = ChatOpenAI(model_name="gpt-4o", api_key=openai_api_key)
groq_client = ChatGroq(model="llama-3.1-70b-versatile", temperature=0, api_key=groq_api_key)
gemini_client = ChatGoogleGenerativeAI(model="gemini-1.5-pro", api_key=google_api_key)

# Function to extract text from a PDF
def extract_pdf(pdf_path):
    try:
        return extract_text(pdf_path)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# Function to split text into chunks
def split_text(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return [Document(page_content=t) for t in splitter.split_text(text)]

# Function to generate embeddings and store them in a vector database
def generate_embeddings(docs):
    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
    return FAISS.from_documents(docs, embeddings)

# Function for query preprocessing
def preprocess_query(query):
    prompt = ChatPromptTemplate.from_template("""
    Transform the following query into a more detailed, keyword-rich affirmative statement that could appear in official data protection regulation documents:

    Query: {query}

    Transformed query:
    """)
    chain = prompt | openai_client
    return chain.invoke({"query": query}).content

# Function to create the RAG chain with Groq
def create_rag_chain(vector_store):
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following context to answer the user's question:\n\n{context}"),
        ("human", "{input}")
    ])
    document_chain = create_stuff_documents_chain(groq_client, prompt)
    return create_retrieval_chain(vector_store.as_retriever(), document_chain)

# Function for the long-context Gemini response
def gemini_response(query, full_pdf_content):
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following full content of official regulation documents to answer the user's question:\n\n{context}"),
        ("human", "{input}")
    ])
    chain = prompt | gemini_client
    return chain.invoke({"context": full_pdf_content, "input": query}).content

# Function to generate the final, synthesized response
def generate_final_response(query, response1, response2):
    prompt = ChatPromptTemplate.from_template("""
    As an AI assistant specializing in data protection and compliance for educators:

    [hidden states, scratchpad]
    1. Analyze for yourself the following two AI-generated responses to the user query.
    2. Think of a comprehensive answer that combines the strengths of both responses.
    3. If the responses contradict each other, highlight this and note whether it might indicate a hallucination.

    [Output]
    4. Provide practical advice on how to meet regulatory requirements in the context of the user question, based on the information given.

    User Query: {query}

    Response 1: {response1}

    Response 2: {response2}

    Your synthesized response:
    """)
    chain = prompt | openai_client
    return chain.invoke({"query": query, "response1": response1, "response2": response2}).content

# Function to process the query end-to-end
def process_query(user_query):
    try:
        preprocessed_query = preprocess_query(user_query)
        print(f"Original query: {user_query}")
        print(f"Preprocessed query: {preprocessed_query}")
        rag_response = rag_chain.invoke({"input": preprocessed_query})["answer"]
        gemini_resp = gemini_response(preprocessed_query, full_pdf_content)
        final_response = generate_final_response(user_query, rag_response, gemini_resp)
        # Convert the final Markdown answer to HTML for display
        html_content = markdown2.markdown(final_response)
        return rag_response, gemini_resp, html_content
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, error_message

# Initialize: extract and index the regulation PDFs
pdf_paths = ["GDPR.pdf", "FERPA.pdf", "COPPA.pdf"]
full_pdf_content = ""
all_documents = []
for pdf_path in pdf_paths:
    extracted_text = extract_pdf(pdf_path)
    full_pdf_content += extracted_text + "\n\n"
    all_documents.extend(split_text(extracted_text))

vector_store = generate_embeddings(all_documents)
rag_chain = create_rag_chain(vector_store)

# Gradio interface
iface = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(label="Ask your data protection related question"),
    outputs=[
        gr.Textbox(label="RAG Pipeline (Llama 3.1) Response"),
        gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response"),
        gr.Textbox(label="Final (GPT-4o) Response")
    ],
    title="Data Protection Team",
    description="Get responses combining advanced RAG, Long Context, and SOTA models to data protection related questions (GDPR, FERPA, COPPA).",
    allow_flagging="never"
)

# Launch the interface
iface.launch()