jeremierostan committed (verified)
Commit f7cfaf7 · 1 Parent(s): 6c30aba

Update app.py

Files changed (1):
  1. app.py +69 -63
app.py CHANGED
@@ -13,7 +13,7 @@ from langchain.chains import create_retrieval_chain
import os
import markdown2

-# Retrieve API keys from HF secrets
+# Retrieve API keys from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')
google_api_key = os.getenv('GEMINI_API_KEY')
@@ -36,53 +36,6 @@ vector_store = None
rag_chain = None
pdfs_loaded = False

-# Function to load regulations with checked boxes or uploaded
-def load_pdfs(selected_regulations, additional_pdfs):
-    global full_pdf_content, vector_store, rag_chain, pdfs_loaded
-
-    documents = []
-    full_pdf_content = ""
-
-    print(f"Selected regulations: {selected_regulations}")  # Debug print
-
-    for regulation in selected_regulations:
-        if regulation in regulation_pdfs:
-            pdf_path = regulation_pdfs[regulation]
-            if os.path.exists(pdf_path):
-                pdf_content = extract_pdf(pdf_path)
-                if pdf_content:
-                    full_pdf_content += pdf_content + "\n\n"
-                    documents.extend(split_text(pdf_content))
-                    print(f"Loaded {regulation} PDF")
-                else:
-                    print(f"Failed to extract content from {regulation} PDF")
-            else:
-                print(f"PDF file for {regulation} not found at {pdf_path}")
-
-    # Load additional user-uploaded PDFs
-    if additional_pdfs is not None:
-        for pdf_file in additional_pdfs:
-            pdf_content = extract_pdf(pdf_file.name)
-            if pdf_content:
-                full_pdf_content += pdf_content + "\n\n"
-                documents.extend(split_text(pdf_content))
-                print(f"Loaded additional PDF: {pdf_file.name}")
-            else:
-                print(f"Failed to extract content from uploaded PDF: {pdf_file.name}")
-
-    if not documents:
-        pdfs_loaded = False
-        return "No PDFs were successfully loaded. Please check your selections and uploads."
-
-    print(f"Total documents loaded: {len(documents)}")
-    print(f"Total content length: {len(full_pdf_content)} characters")
-
-    vector_store = generate_embeddings(documents)
-    rag_chain = create_rag_chain(vector_store)
-
-    pdfs_loaded = True
-    return f"PDFs loaded and RAG system updated successfully! Loaded {len(documents)} document chunks."
-
# Function to extract text from PDF
def extract_pdf(pdf_path):
    try:
@@ -146,7 +99,62 @@ def generate_final_response(response1, response2):
    chain = prompt | openai_client
    return chain.invoke({"response1": response1, "response2": response2}).content

-# Function to process the query
+def markdown_to_html(content):
+    return markdown2.markdown(content)
+
+def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
+    global full_pdf_content, vector_store, rag_chain, pdfs_loaded
+
+    documents = []
+    full_pdf_content = ""
+
+    # Load selected regulation PDFs
+    selected_regulations = []
+    if gdpr:
+        selected_regulations.append("GDPR")
+    if ferpa:
+        selected_regulations.append("FERPA")
+    if coppa:
+        selected_regulations.append("COPPA")
+
+    for regulation in selected_regulations:
+        if regulation in regulation_pdfs:
+            pdf_path = regulation_pdfs[regulation]
+            if os.path.exists(pdf_path):
+                pdf_content = extract_pdf(pdf_path)
+                if pdf_content:
+                    full_pdf_content += pdf_content + "\n\n"
+                    documents.extend(split_text(pdf_content))
+                    print(f"Loaded {regulation} PDF")
+                else:
+                    print(f"Failed to extract content from {regulation} PDF")
+            else:
+                print(f"PDF file for {regulation} not found at {pdf_path}")
+
+    # Load additional user-uploaded PDFs
+    if additional_pdfs is not None:
+        for pdf_file in additional_pdfs:
+            pdf_content = extract_pdf(pdf_file.name)
+            if pdf_content:
+                full_pdf_content += pdf_content + "\n\n"
+                documents.extend(split_text(pdf_content))
+                print(f"Loaded additional PDF: {pdf_file.name}")
+            else:
+                print(f"Failed to extract content from uploaded PDF: {pdf_file.name}")
+
+    if not documents:
+        pdfs_loaded = False
+        return "No PDFs were successfully loaded. Please check your selections and uploads."
+
+    print(f"Total documents loaded: {len(documents)}")
+    print(f"Total content length: {len(full_pdf_content)} characters")
+
+    vector_store = generate_embeddings(documents)
+    rag_chain = create_rag_chain(vector_store)
+
+    pdfs_loaded = True
+    return f"PDFs loaded and RAG system updated successfully! Loaded {len(documents)} document chunks."
+
def process_query(user_query):
    global rag_chain, full_pdf_content, pdfs_loaded

@@ -168,19 +176,15 @@ def process_query(user_query):

    return rag_response, gemini_resp, html_content

-# Function to output the final response as markdown
-def markdown_to_html(content):
-    return markdown2.markdown(content)
-
# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Data Protection Team")
    gr.Markdown("Get responses combining advanced RAG, Long Context, and SOTA models to data protection related questions.")

-    regulations_checkboxes = gr.CheckboxGroup(
-        choices=["GDPR (EU)", "FERPA (US)", "COPPA (US <13)"],
-        label="Select Regulations"
-    )
+    with gr.Row():
+        gdpr_checkbox = gr.Checkbox(label="GDPR (EU)")
+        ferpa_checkbox = gr.Checkbox(label="FERPA (US)")
+        coppa_checkbox = gr.Checkbox(label="COPPA (US <13)")

    gr.Markdown("**Optional: upload additional PDFs if needed (national regulation, school policy)**")
    additional_pdfs = gr.File(
@@ -202,12 +206,14 @@ with gr.Blocks() as iface:
    gemini_output = gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response")
    final_output = gr.HTML(label="Final (GPT-4o) Response")

-    def prepare_regulations(selected):
-        return [reg.split()[0] for reg in selected]
-
    load_button.click(
-        lambda selected, additional: load_pdfs(prepare_regulations(selected), additional),
-        inputs=[regulations_checkboxes, additional_pdfs],
+        load_pdfs,
+        inputs=[
+            gdpr_checkbox,
+            ferpa_checkbox,
+            coppa_checkbox,
+            additional_pdfs
+        ],
        outputs=load_output
    )

@@ -217,4 +223,4 @@ with gr.Blocks() as iface:
        outputs=[rag_output, gemini_output, final_output]
    )

-iface.launch()
+iface.launch()
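The core of this commit is the rewiring of `load_button.click`: because Gradio passes each listed input component's current value to the handler as a separate positional argument, the new `load_pdfs(gdpr, ferpa, coppa, additional_pdfs)` signature can be connected directly to the three `gr.Checkbox` components and the `gr.File` upload, and the removed `prepare_regulations` helper and lambda adapter are no longer needed. The sketch below is a minimal, hypothetical reproduction of just that wiring, not the Space's actual code; the handler body, the `file_count="multiple"` setting, and the button/textbox labels are illustrative assumptions.

```python
# Minimal sketch (hypothetical): each input component's value arrives as its own
# positional argument, so the checkboxes are received as plain booleans.
import gradio as gr

def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
    # Stand-in body: just report what was selected and how many files were uploaded.
    selected = [name for name, flag in
                [("GDPR", gdpr), ("FERPA", ferpa), ("COPPA", coppa)] if flag]
    uploads = 0 if additional_pdfs is None else len(additional_pdfs)
    return f"Selected: {selected or 'none'}; uploaded files: {uploads}"

with gr.Blocks() as demo:
    with gr.Row():
        gdpr_checkbox = gr.Checkbox(label="GDPR (EU)")
        ferpa_checkbox = gr.Checkbox(label="FERPA (US)")
        coppa_checkbox = gr.Checkbox(label="COPPA (US <13)")
    additional_pdfs = gr.File(file_count="multiple", label="Additional PDFs")
    load_button = gr.Button("Load PDFs")
    load_output = gr.Textbox(label="Load status")

    # The function is passed directly; no lambda or string parsing is required.
    load_button.click(
        load_pdfs,
        inputs=[gdpr_checkbox, ferpa_checkbox, coppa_checkbox, additional_pdfs],
        outputs=load_output,
    )

if __name__ == "__main__":
    demo.launch()
```

Compared with the removed `gr.CheckboxGroup`, the trade-off is three separate boolean arguments instead of a single list of selected labels, which drops the string parsing (`reg.split()[0]`) from the UI layer at the cost of one extra parameter per regulation.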