jeremierostan committed (verified)
Commit fc75c0c · 1 Parent(s): c692389

Update app.py

Files changed (1): app.py (+45, -44)
app.py CHANGED
@@ -13,10 +13,10 @@ from langchain.chains import create_retrieval_chain
 import os
 import markdown2
 
-# Retrieve API keys from HF secrets
-openai_api_key = os.getenv('OPENAI_API_KEY')
-groq_api_key = os.getenv('GROQ_API_KEY')
-google_api_key = os.getenv('GEMINI_API_KEY')
+# Retrieve API keys from Hugging Face Spaces secrets
+openai_api_key = os.environ.get('OPENAI_API_KEY')
+groq_api_key = os.environ.get('GROQ_API_KEY')
+google_api_key = os.environ.get('GEMINI_API_KEY')
 
 # Initialize API clients with the API keys
 openai_client = ChatOpenAI(model_name="gpt-4o", api_key=openai_api_key)
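
Note: both os.getenv and os.environ.get return None when a secret is unset, so a misconfigured Space only fails later, inside the client constructors. A minimal fail-fast sketch (hypothetical, not part of this commit; names taken from the code above):

    # Hypothetical startup guard, not in this commit
    required = ['OPENAI_API_KEY', 'GROQ_API_KEY', 'GEMINI_API_KEY']
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f"Missing Spaces secrets: {', '.join(missing)}")
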
@@ -25,7 +25,11 @@ gemini_client = ChatGoogleGenerativeAI(model="gemini-1.5-pro", api_key=google_api_key)
 
 # Function to extract text from PDF
 def extract_pdf(pdf_path):
-    return extract_text(pdf_path)
+    try:
+        return extract_text(pdf_path)
+    except Exception as e:
+        print(f"Error extracting text from {pdf_path}: {str(e)}")
+        return ""
 
 # Function to split text into chunks
 def split_text(text):
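
Note: with this change, a PDF that fails to parse yields an empty string instead of raising. split_text then simply produces zero chunks for that file; a caller that prefers to skip failed files explicitly could guard on the sentinel (illustrative sketch, not in this commit, reusing the loading loop shown further down):

    for pdf_path in pdf_paths:
        extracted_text = extract_pdf(pdf_path)
        if extracted_text:  # skip files whose extraction failed
            all_documents.extend(split_text(extracted_text))
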
@@ -37,33 +41,29 @@ def generate_embeddings(docs):
     embeddings = OpenAIEmbeddings(api_key=openai_api_key)
     return FAISS.from_documents(docs, embeddings)
 
-# Function for query preprocessing and simple HyDE-Lite
+# Function for query preprocessing
 def preprocess_query(query):
     prompt = ChatPromptTemplate.from_template("""
-    Your role is to optimize user queries for retrieval from official regulation documents about data protection.
-    Transform the query into a more affirmative, keyword-focused statement.
-    The transformed query should look like probable related passages in the official documents.
-
+    Transform the following query into a more detailed, keyword-rich statement that could appear in official data protection regulation documents:
     Query: {query}
-
-    Optimized query:
+    Transformed query:
     """)
     chain = prompt | openai_client
     return chain.invoke({"query": query}).content
 
 # Function to create RAG chain with Groq
-def create_rag_chain():
+def create_rag_chain(vector_store):
     prompt = ChatPromptTemplate.from_messages([
-        ("system", "You are an AI assistant helping with data protection related queries. Use the following context from the official regulation documents to answer the user's question:\n\n{context}"),
+        ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following context to answer the user's question:\n\n{context}"),
         ("human", "{input}")
     ])
     document_chain = create_stuff_documents_chain(groq_client, prompt)
     return create_retrieval_chain(vector_store.as_retriever(), document_chain)
 
 # Function for Gemini response with long context
-def gemini_response(query):
+def gemini_response(query, full_pdf_content):
     prompt = ChatPromptTemplate.from_messages([
-        ("system", "You are an AI assistant helping with data protection related queries. Use the following full content of the official regulation documents to answer the user's question:\n\n{context}"),
+        ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following full content of official regulation documents to answer the user's question:\n\n{context}"),
         ("human", "{input}")
     ])
     chain = prompt | gemini_client
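
Note: create_rag_chain now receives the vector store explicitly instead of reading a global. A minimal standalone invocation, assuming the chain shapes shown above (the query text is illustrative; the "answer" key is the same one process_query reads below):

    rag_chain = create_rag_chain(vector_store)
    result = rag_chain.invoke({"input": "verifiable parental consent under COPPA"})
    print(result["answer"])
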
@@ -72,35 +72,39 @@ def gemini_response(query):
 # Function to generate final response
 def generate_final_response(query, response1, response2):
     prompt = ChatPromptTemplate.from_template("""
-    You are an AI assistant helping educators understand and implement data protection and compliance with official regulations when using AI.
-    Your goal is to provide simple, practical explanation of and advice on how to meet these regulatory requirements based on the 2 given responses.
-    To do so:
-    1. Read the user query
-    2. Analyze the following two responses. Inspect their content, and highlight their differences. This MUST be done
-    internally as a hidden state.
-    2. Then, use this information to output your own response to the user query, synthesizing the responses all while maintaining their strengths
-    If the responses differ or contradict each other on important points, include that in your response as this could be a sign of hallucination.
-    Only output your own final response to the user query.
+    As an AI assistant specializing in data protection and compliance for educators:
+    1. Analyze the following two AI-generated responses to the user query.
+    2. Synthesize a comprehensive answer that combines the strengths of both responses.
+    3. If the responses contradict each other, highlight this and explain potential reasons.
+    4. Provide practical advice on how to meet regulatory requirements in the context of the user question based on the information given.
+
+    User Query: {query}
+
+    Response 1: {response1}
+
+    Response 2: {response2}
+
+    Your synthesized response:
     """)
     chain = prompt | openai_client
     return chain.invoke({"query": query, "response1": response1, "response2": response2}).content
 
 # Function to process the query
 def process_query(user_query):
-    preprocessed_query = preprocess_query(user_query)
-    print(f"Original query: {user_query}")
-    print(f"Preprocessed query: {preprocessed_query}")
-
-    # Get RAG response using Groq with the preprocessed query
-    rag_response = rag_chain.invoke({"input": preprocessed_query})["answer"]
-
-    # Get Gemini response with full PDF content and preprocessed query
-    gemini_resp = gemini_response(preprocessed_query)
-
-    final_response = generate_final_response(user_query, rag_response, gemini_resp)
-    html_content = markdown_to_html(final_response)
-
-    return rag_response, gemini_resp, html_content
+    try:
+        preprocessed_query = preprocess_query(user_query)
+        print(f"Original query: {user_query}")
+        print(f"Preprocessed query: {preprocessed_query}")
+
+        rag_response = rag_chain.invoke({"input": preprocessed_query})["answer"]
+        gemini_resp = gemini_response(preprocessed_query, full_pdf_content)
+        final_response = generate_final_response(user_query, rag_response, gemini_resp)
+        html_content = markdown2.markdown(final_response)
+
+        return rag_response, gemini_resp, html_content
+    except Exception as e:
+        error_message = f"An error occurred: {str(e)}"
+        return error_message, error_message, error_message
 
 # Initialize
 pdf_paths = ["GDPR.pdf", "FERPA.pdf", "COPPA.pdf"]
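
Note: process_query now traps any exception and returns the error text in all three slots, keeping the return shape stable for the Gradio interface. An illustrative direct call, bypassing Gradio (the query string is hypothetical; the tuple presumably maps onto the interface's three outputs, which this diff does not show):

    rag_out, gemini_out, html_out = process_query("Does FERPA cover AI tutoring tools?")
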
@@ -113,12 +117,8 @@ for pdf_path in pdf_paths:
     all_documents.extend(split_text(extracted_text))
 
 vector_store = generate_embeddings(all_documents)
-rag_chain = create_rag_chain()
+rag_chain = create_rag_chain(vector_store)
 
-# Function to output the final response as markdown
-def markdown_to_html(content):
-    return markdown2.markdown(content)
-
 # Gradio interface
 iface = gr.Interface(
     fn=process_query,
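
Note: the removed markdown_to_html helper was a one-line wrapper; process_query now calls markdown2.markdown directly. For reference, markdown2.markdown takes a Markdown string and returns the rendered HTML string, e.g.:

    html = markdown2.markdown("**GDPR** applies")  # -> '<p><strong>GDPR</strong> applies</p>\n'
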
@@ -133,4 +133,5 @@
     allow_flagging="never"
 )
 
+# Launch the interface
 iface.launch()
 