# Data Protection Team — Gradio app combining RAG, long-context, and synthesis models over regulation PDFs
| import gradio as gr | |
| from pdfminer.high_level import extract_text | |
| from langchain_groq import ChatGroq | |
| from langchain_google_genai import ChatGoogleGenerativeAI | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import FAISS | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.schema import Document | |
| from langchain_openai import ChatOpenAI | |
| from langchain.prompts import ChatPromptTemplate | |
| from langchain.chains.combine_documents import create_stuff_documents_chain | |
| from langchain.chains import create_retrieval_chain | |
| import os | |
| import markdown2 | |
# Retrieve API keys from HF secrets
# NOTE(review): os.getenv returns None when a secret is missing; the client
# constructors below would then be created with api_key=None and fail on
# first use — confirm all three secrets are configured in the Space.
openai_api_key = os.getenv('OPENAI_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')
google_api_key = os.getenv('GEMINI_API_KEY')

# Initialize API clients with the API keys.
# Per the UI labels and process_query: Groq/Llama backs the RAG chain,
# Gemini handles the long-context pass, and GPT-4o presumably produces the
# final synthesized response (its call site is outside this view — verify).
openai_client = ChatOpenAI(model_name="gpt-4o", api_key=openai_api_key)
groq_client = ChatGroq(model="llama-3.1-70b-versatile", temperature=0, api_key=groq_api_key)
gemini_client = ChatGoogleGenerativeAI(model="gemini-1.5-pro", api_key=google_api_key)

# Define paths for regulation PDFs: maps the UI checkbox names to the
# bundled PDF files expected alongside this script.
regulation_pdfs = {
    "GDPR": "GDPR.pdf",
    "FERPA": "FERPA.pdf",
    "COPPA": "COPPA.pdf"
}
# Function to extract text from PDF
def extract_pdf(pdf_path):
    """Return the full text of the PDF at *pdf_path*, or "" on any failure.

    Best-effort by design: extraction errors are logged and swallowed so a
    single unreadable PDF never aborts a whole corpus load.
    """
    try:
        text = extract_text(pdf_path)
    except Exception as err:
        print(f"Error extracting text from {pdf_path}: {str(err)}")
        return ""
    return text
| # ... (other functions remain unchanged) | |
def _checked_regulations(gdpr, ferpa, coppa):
    """Translate the three UI checkbox booleans into regulation names."""
    flags = (("GDPR", gdpr), ("FERPA", ferpa), ("COPPA", coppa))
    return [name for name, checked in flags if checked]


def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
    """Build the retrieval corpus from the selected regulation PDFs plus any
    user-uploaded PDFs, then rebuild the vector store and RAG chain.

    Args:
        gdpr, ferpa, coppa: checkbox states selecting the bundled PDFs.
        additional_pdfs: list of uploaded file objects (or None).

    Returns:
        A human-readable status string for the Gradio "Load Status" box.

    Side effects:
        Rebinds the module globals full_pdf_content, vector_store, rag_chain.
    """
    global full_pdf_content, vector_store, rag_chain
    documents = []
    full_pdf_content = ""

    # Load selected regulation PDFs.
    for regulation in _checked_regulations(gdpr, ferpa, coppa):
        # .get() replaces the previous redundant membership test (the list
        # above is built only from regulation_pdfs keys, so it could never
        # miss) while staying safe if the mapping ever changes.
        pdf_path = regulation_pdfs.get(regulation)
        if pdf_path is None:
            continue
        if not os.path.exists(pdf_path):
            print(f"PDF file for {regulation} not found at {pdf_path}")
            continue
        pdf_content = extract_pdf(pdf_path)
        if pdf_content:
            full_pdf_content += pdf_content + "\n\n"
            documents.extend(split_text(pdf_content))
            print(f"Loaded {regulation} PDF")
        else:
            print(f"Failed to extract content from {regulation} PDF")

    # Load additional user-uploaded PDFs.
    if additional_pdfs is not None:
        for pdf_file in additional_pdfs:
            pdf_content = extract_pdf(pdf_file.name)
            if pdf_content:
                full_pdf_content += pdf_content + "\n\n"
                documents.extend(split_text(pdf_content))
                print(f"Loaded additional PDF: {pdf_file.name}")
            else:
                print(f"Failed to extract content from uploaded PDF: {pdf_file.name}")

    # Nothing loaded: leave vector_store/rag_chain untouched and tell the user.
    if not documents:
        return "No PDFs were successfully loaded. Please check your selections and uploads."

    print(f"Total documents loaded: {len(documents)}")
    print(f"Total content length: {len(full_pdf_content)} characters")

    # Rebuild the retrieval stack over the fresh chunks.
    vector_store = generate_embeddings(documents)
    rag_chain = create_rag_chain(vector_store)

    return f"PDFs loaded and RAG system updated successfully! Loaded {len(documents)} document chunks."
def process_query(user_query):
    """Answer *user_query* via three routes: RAG (Groq-backed chain),
    long-context (Gemini over the full PDF text), and a synthesized final
    answer rendered as HTML."""
    global rag_chain, full_pdf_content

    # Guard: nothing can be answered until load_pdfs() has populated state.
    if rag_chain is None or not full_pdf_content:
        return (
            "Please load PDFs before asking questions.",
            "Please load PDFs before asking questions.",
            "Please load PDFs and initialize the system before asking questions.",
        )

    cleaned_query = preprocess_query(user_query)

    # Route 1: retrieval-augmented answer via the Groq RAG chain.
    rag_response = rag_chain.invoke({"input": cleaned_query})["answer"]
    # Route 2: long-context answer over the entire PDF corpus (Gemini).
    gemini_resp = gemini_response(cleaned_query, full_pdf_content)
    # Route 3: synthesize both answers and render the markdown as HTML.
    final_response = generate_final_response(rag_response, gemini_resp)
    html_content = markdown_to_html(final_response)

    return rag_response, gemini_resp, html_content


# Initialize module state; populated by load_pdfs().
full_pdf_content = ""
vector_store = None
rag_chain = None
# Gradio interface: components render in declaration order, so the layout
# below is the page order (pickers -> upload -> load -> question -> results).
with gr.Blocks() as iface:
    gr.Markdown("# Data Protection Team")
    gr.Markdown(
        "Get responses combining advanced RAG, Long Context, and SOTA models to data protection related questions."
    )

    # Regulation pickers, shown side by side.
    with gr.Row():
        cb_gdpr = gr.Checkbox(label="GDPR (EU)")
        cb_ferpa = gr.Checkbox(label="FERPA (US)")
        cb_coppa = gr.Checkbox(label="COPPA (US <13)")

    gr.Markdown("**Optional: upload additional PDFs if needed (national regulation, school policy)**")
    extra_pdfs = gr.File(
        file_count="multiple",
        label="Upload additional PDFs",
        file_types=[".pdf"],
        elem_id="file_upload",
    )

    btn_load = gr.Button("Load PDFs")
    status_box = gr.Textbox(label="Load Status")

    gr.Markdown("**Ask your data protection related question**")
    question_box = gr.Textbox(label="Your Question", placeholder="Ask your question here...")
    btn_ask = gr.Button("Submit Query")

    gr.Markdown("**Results**")
    out_rag = gr.Textbox(label="RAG Pipeline (Llama3.1) Response")
    out_gemini = gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response")
    out_final = gr.HTML(label="Final (GPT-4o) Response")

    # Wire the buttons to the backend functions defined above.
    btn_load.click(
        load_pdfs,
        inputs=[cb_gdpr, cb_ferpa, cb_coppa, extra_pdfs],
        outputs=status_box,
    )
    btn_ask.click(
        process_query,
        inputs=question_box,
        outputs=[out_rag, out_gemini, out_final],
    )

iface.launch()