import os

import gradio as gr
import markdown2
from pdfminer.high_level import extract_text
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Retrieve API keys from Hugging Face Spaces secrets
openai_api_key = os.environ.get('OPENAI_API_KEY')
groq_api_key = os.environ.get('GROQ_API_KEY')
google_api_key = os.environ.get('GEMINI_API_KEY')

# Initialize API clients with the API keys
openai_client = ChatOpenAI(model="gpt-4o", api_key=openai_api_key)
groq_client = ChatGroq(model="llama-3.1-70b-versatile", temperature=0, api_key=groq_api_key)
gemini_client = ChatGoogleGenerativeAI(model="gemini-1.5-pro", api_key=google_api_key)

# Function to extract text from a PDF
def extract_pdf(pdf_path):
    try:
        return extract_text(pdf_path)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# Function to split text into overlapping chunks
def split_text(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return [Document(page_content=t) for t in splitter.split_text(text)]

# Function to generate embeddings and store them in a FAISS vector store
def generate_embeddings(docs):
    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
    return FAISS.from_documents(docs, embeddings)

# Function for query preprocessing (expands the query for better retrieval)
def preprocess_query(query):
    prompt = ChatPromptTemplate.from_template("""
    Transform the following query into a more detailed, keyword-rich statement that could appear in official data protection regulation documents:

    Query: {query}

    Transformed query:
    """)
    chain = prompt | openai_client
    return chain.invoke({"query": query}).content

# Function to create the RAG chain with Groq
def create_rag_chain(vector_store):
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following context to answer the user's question:\n\n{context}"),
        ("human", "{input}")
    ])
    document_chain = create_stuff_documents_chain(groq_client, prompt)
    return create_retrieval_chain(vector_store.as_retriever(), document_chain)

# Function for a Gemini response using the full documents as long context
def gemini_response(query, full_pdf_content):
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following full content of official regulation documents to answer the user's question:\n\n{context}"),
        ("human", "{input}")
    ])
    chain = prompt | gemini_client
    return chain.invoke({"context": full_pdf_content, "input": query}).content

# Function to synthesize the final response from the two model outputs
def generate_final_response(query, response1, response2):
    prompt = ChatPromptTemplate.from_template("""
    As an AI assistant specializing in data protection and compliance for educators:
    1. Analyze the following two AI-generated responses to the user query.
    2. Synthesize a comprehensive answer that combines the strengths of both responses.
    3. If the responses contradict each other, highlight this and explain potential reasons.
    4. Provide practical advice on how to meet regulatory requirements in the context of the user question based on the information given.

    User Query: {query}

    Response 1: {response1}

    Response 2: {response2}

    Your synthesized response:
    """)
    chain = prompt | openai_client
    return chain.invoke({"query": query, "response1": response1, "response2": response2}).content

# Function to process the query end to end
def process_query(user_query):
    try:
        preprocessed_query = preprocess_query(user_query)
        print(f"Original query: {user_query}")
        print(f"Preprocessed query: {preprocessed_query}")

        rag_response = rag_chain.invoke({"input": preprocessed_query})["answer"]
        gemini_resp = gemini_response(preprocessed_query, full_pdf_content)
        final_response = generate_final_response(user_query, rag_response, gemini_resp)
        html_content = markdown2.markdown(final_response)

        return rag_response, gemini_resp, html_content
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, error_message

# Initialize: extract, chunk, and index the regulation documents
pdf_paths = ["GDPR.pdf", "FERPA.pdf", "COPPA.pdf"]
full_pdf_content = ""
all_documents = []

for pdf_path in pdf_paths:
    extracted_text = extract_pdf(pdf_path)
    full_pdf_content += extracted_text + "\n\n"
    all_documents.extend(split_text(extracted_text))

vector_store = generate_embeddings(all_documents)
rag_chain = create_rag_chain(vector_store)

# Gradio interface
iface = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(label="Ask your data protection related question"),
    outputs=[
        gr.Textbox(label="RAG Pipeline (Llama 3.1) Response"),
        gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response"),
        gr.HTML(label="Final (GPT-4o) Response")
    ],
    title="Data Protection Team",
    description="Get answers to data protection questions (GDPR, FERPA, COPPA) by combining an advanced RAG pipeline, a long-context model, and a SOTA synthesis model.",
    allow_flagging="never"
)

# Launch the interface
iface.launch()
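# A minimal smoke test, assuming the three PDFs and all three API keys are
# available in the environment: call the pipeline directly instead of going
# through the Gradio UI. Run it before iface.launch() (or comment the launch
# call out), since launch() blocks. The example question is hypothetical.
#
#   rag, gemini, final_html = process_query(
#       "Does COPPA apply to a classroom app used by 12-year-olds?"
#   )
#   print(final_html)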