"""Data Protection Team — Gradio app.

Answers data-protection questions by combining three pipelines:
a RAG chain over regulation PDFs (Groq / Llama 3.1), a long-context
pass over the full PDF text (Gemini 1.5 Pro), and a final synthesis
step (GPT-4o).
"""

# Standard library
import os

# Third-party
import gradio as gr
import markdown2
from pdfminer.high_level import extract_text

# LangChain and model providers
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
# NOTE: OpenAIEmbeddings now lives in langchain_openai; the old
# `langchain.embeddings` path is deprecated.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Retrieve API keys from HF secrets (environment variables).
# os.getenv returns None when a key is missing; client construction may
# then fail at first use — keys are expected to be set in the Space.
openai_api_key = os.getenv('OPENAI_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')
google_api_key = os.getenv('GEMINI_API_KEY')

# Initialize one client per provider with its API key.
openai_client = ChatOpenAI(model_name="gpt-4o", api_key=openai_api_key)
groq_client = ChatGroq(model="llama-3.1-70b-versatile", temperature=0,
                       api_key=groq_api_key)
gemini_client = ChatGoogleGenerativeAI(model="gemini-1.5-pro",
                                       api_key=google_api_key)

# Paths (relative to the working directory) of the bundled regulation PDFs.
regulation_pdfs = {
    "GDPR": "GDPR.pdf",
    "FERPA": "FERPA.pdf",
    "COPPA": "COPPA.pdf"
}


def extract_pdf(pdf_path):
    """Return the full text of the PDF at *pdf_path*.

    Best-effort: any extraction failure is logged to stdout and an
    empty string is returned so callers can simply skip the file.
    """
    try:
        return extract_text(pdf_path)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# ...
# (other functions remain unchanged)
def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
    """Load the checked regulation PDFs plus any uploads, then rebuild
    the vector store and RAG chain from the combined text.

    Returns a human-readable status string for the UI.
    """
    global full_pdf_content, vector_store, rag_chain

    documents = []
    full_pdf_content = ""

    # Map each checkbox flag onto its regulation name and load the
    # selected ones from disk.
    checkbox_flags = {"GDPR": gdpr, "FERPA": ferpa, "COPPA": coppa}
    for regulation, wanted in checkbox_flags.items():
        if not wanted or regulation not in regulation_pdfs:
            continue
        pdf_path = regulation_pdfs[regulation]
        if not os.path.exists(pdf_path):
            print(f"PDF file for {regulation} not found at {pdf_path}")
            continue
        pdf_content = extract_pdf(pdf_path)
        if pdf_content:
            full_pdf_content += pdf_content + "\n\n"
            documents.extend(split_text(pdf_content))
            print(f"Loaded {regulation} PDF")
        else:
            print(f"Failed to extract content from {regulation} PDF")

    # Fold in any user-uploaded PDFs (gradio File objects expose .name).
    for pdf_file in (additional_pdfs or []):
        pdf_content = extract_pdf(pdf_file.name)
        if pdf_content:
            full_pdf_content += pdf_content + "\n\n"
            documents.extend(split_text(pdf_content))
            print(f"Loaded additional PDF: {pdf_file.name}")
        else:
            print(f"Failed to extract content from uploaded PDF: {pdf_file.name}")

    if not documents:
        return "No PDFs were successfully loaded. Please check your selections and uploads."

    print(f"Total documents loaded: {len(documents)}")
    print(f"Total content length: {len(full_pdf_content)} characters")

    # Rebuild the retrieval stack over the freshly loaded chunks.
    vector_store = generate_embeddings(documents)
    rag_chain = create_rag_chain(vector_store)

    return f"PDFs loaded and RAG system updated successfully! Loaded {len(documents)} document chunks."
def process_query(user_query):
    """Run *user_query* through all three model pipelines.

    Returns a 3-tuple:
      (RAG answer text, Gemini long-context answer text,
       HTML-rendered final GPT-4o synthesis).
    If the PDFs have not been loaded yet, returns placeholder prompts
    for each output widget instead.
    """
    global rag_chain, full_pdf_content

    # Guard: the chain and PDF text only exist after load_pdfs() succeeded.
    if rag_chain is None or not full_pdf_content:
        return ("Please load PDFs before asking questions.",
                "Please load PDFs before asking questions.",
                "Please load PDFs and initialize the system before asking questions.")

    preprocessed_query = preprocess_query(user_query)

    # Get RAG response using Groq
    rag_response = rag_chain.invoke({"input": preprocessed_query})["answer"]

    # Get Gemini response with full PDF content
    gemini_resp = gemini_response(preprocessed_query, full_pdf_content)

    # Synthesize both answers with GPT-4o, then render markdown to HTML
    # for the gr.HTML output widget.
    final_response = generate_final_response(rag_response, gemini_resp)
    html_content = markdown_to_html(final_response)

    return rag_response, gemini_resp, html_content


# Module-level state, (re)populated by load_pdfs().
full_pdf_content = ""
vector_store = None
rag_chain = None

# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Data Protection Team")
    gr.Markdown("Get responses combining advanced RAG, Long Context, and SOTA models to data protection related questions.")

    # Regulation selection checkboxes, side by side.
    with gr.Row():
        gdpr_checkbox = gr.Checkbox(label="GDPR (EU)")
        ferpa_checkbox = gr.Checkbox(label="FERPA (US)")
        coppa_checkbox = gr.Checkbox(label="COPPA (US <13)")

    gr.Markdown("**Optional: upload additional PDFs if needed (national regulation, school policy)**")
    additional_pdfs = gr.File(
        file_count="multiple",
        label="Upload additional PDFs",
        file_types=[".pdf"],
        elem_id="file_upload"
    )

    load_button = gr.Button("Load PDFs")
    load_output = gr.Textbox(label="Load Status")

    gr.Markdown("**Ask your data protection related question**")
    query_input = gr.Textbox(label="Your Question", placeholder="Ask your question here...")
    query_button = gr.Button("Submit Query")

    gr.Markdown("**Results**")
    rag_output = gr.Textbox(label="RAG Pipeline (Llama3.1) Response")
    gemini_output = gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response")
    final_output = gr.HTML(label="Final (GPT-4o) Response")

    # Wire buttons to their handlers.
    load_button.click(
        load_pdfs,
        inputs=[gdpr_checkbox, ferpa_checkbox, coppa_checkbox, additional_pdfs],
        outputs=load_output
    )
    query_button.click(
        process_query,
        inputs=query_input,
        outputs=[rag_output, gemini_output, final_output]
    )

# Guard the launch so importing this module (e.g. from tests or tooling)
# does not start the web server as a side effect.
if __name__ == "__main__":
    iface.launch()