Spaces:

jeremierostan
/

Data_Protection_Team

Running

App Files Files Community

jeremierostan committed on Jul 27, 2024

Commit

dbc9d4a

verified ·

1 Parent(s): bd777f5

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -1

app.py CHANGED Viewed

@@ -38,7 +38,63 @@ def extract_pdf(pdf_path):
         print(f"Error extracting text from {pdf_path}: {str(e)}")
         return ""
-# ... (other functions remain unchanged)
 def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
     global full_pdf_content, vector_store, rag_chain

         print(f"Error extracting text from {pdf_path}: {str(e)}")
         return ""
+# Function to split text into chunks
+def split_text(text):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    return [Document(page_content=t) for t in splitter.split_text(text)]
+# Function to generate embeddings and store in vector database
+def generate_embeddings(docs):
+    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
+    return FAISS.from_documents(docs, embeddings)
+# Function for query preprocessing and simple HyDE-Lite
+def preprocess_query(query):
+    prompt = ChatPromptTemplate.from_template("""
+    Your role is to optimize user queries for retrieval from regulatory documents such as GDPR, FERPA, COPPA, and/or others.
+    Transform the query into a more affirmative, keyword-focused statement.
+    The transformed query should look like probable related passages in the official documents.
+    Query: {query}
+    Optimized query:
+    """)
+    chain = prompt | openai_client
+    return chain.invoke({"query": query}).content
+# Function to create RAG chain with Groq
+def create_rag_chain(vector_store):
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are an AI assistant helping with regulatory compliance queries. Use the following context from the official regulatory documents to answer the user's question:\n\n{context}"),
+        ("human", "{input}")
+    ])
+    document_chain = create_stuff_documents_chain(groq_client, prompt)
+    return create_retrieval_chain(vector_store.as_retriever(), document_chain)
+# Function for Gemini response with long context
+def gemini_response(query, full_content):
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are an AI assistant helping with regulatory compliance queries. Use the following full content of the official regulatory documents to answer the user's question:\n\n{context}"),
+        ("human", "{input}")
+    ])
+    chain = prompt | gemini_client
+    return chain.invoke({"context": full_content, "input": query}).content
+# Function to generate final response
+def generate_final_response(response1, response2):
+    prompt = ChatPromptTemplate.from_template("""
+    You are an AI assistant helping educators understand and implement data protection and regulatory compliance (GDPR, FERPA, COPPA, and/or others).
+    Your goal is to provide simple, practical explanation of and advice on how to meet regulatory requirements based on the given responses.
+    To do so:
+    1. Analyze the following two responses. Inspect their content, and highlight differences. This MUST be done
+    internally as a hidden state.
+    2. Then, use this information to output your own response combining the best from both.
+    If the responses differ or contradict each other on important points, include that in your response.
+    Only output your own response.
+    """)
+    chain = prompt | openai_client
+    return chain.invoke({"response1": response1, "response2": response2}).content
+def markdown_to_html(content):
+    return markdown2.markdown(content)
 def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
     global full_pdf_content, vector_store, rag_chain