jeremierostan committed · Commit dbc9d4a · verified · 1 Parent(s): bd777f5

Update app.py

Files changed (1)
  1. app.py +57 -1
app.py CHANGED
@@ -38,7 +38,63 @@ def extract_pdf(pdf_path):
         print(f"Error extracting text from {pdf_path}: {str(e)}")
         return ""
 
-# ... (other functions remain unchanged)
+# Function to split text into chunks
+def split_text(text):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    return [Document(page_content=t) for t in splitter.split_text(text)]
+
+# Function to generate embeddings and store in vector database
+def generate_embeddings(docs):
+    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
+    return FAISS.from_documents(docs, embeddings)
+
+# Function for query preprocessing and simple HyDE-Lite
+def preprocess_query(query):
+    prompt = ChatPromptTemplate.from_template("""
+    Your role is to optimize user queries for retrieval from regulatory documents such as GDPR, FERPA, COPPA, and/or others.
+    Transform the query into a more affirmative, keyword-focused statement.
+    The transformed query should look like probable related passages in the official documents.
+    Query: {query}
+    Optimized query:
+    """)
+    chain = prompt | openai_client
+    return chain.invoke({"query": query}).content
+
+# Function to create RAG chain with Groq
+def create_rag_chain(vector_store):
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are an AI assistant helping with regulatory compliance queries. Use the following context from the official regulatory documents to answer the user's question:\n\n{context}"),
+        ("human", "{input}")
+    ])
+    document_chain = create_stuff_documents_chain(groq_client, prompt)
+    return create_retrieval_chain(vector_store.as_retriever(), document_chain)
+
+# Function for Gemini response with long context
+def gemini_response(query, full_content):
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are an AI assistant helping with regulatory compliance queries. Use the following full content of the official regulatory documents to answer the user's question:\n\n{context}"),
+        ("human", "{input}")
+    ])
+    chain = prompt | gemini_client
+    return chain.invoke({"context": full_content, "input": query}).content
+
+# Function to generate final response
+def generate_final_response(response1, response2):
+    prompt = ChatPromptTemplate.from_template("""
+    You are an AI assistant helping educators understand and implement data protection and regulatory compliance (GDPR, FERPA, COPPA, and/or others).
+    Your goal is to provide simple, practical explanations of and advice on how to meet regulatory requirements based on the given responses.
+    To do so:
+    1. Analyze the following two responses. Inspect their content, and highlight differences. This MUST be done internally as a hidden state.
+    2. Then, use this information to output your own response combining the best from both.
+    If the responses differ or contradict each other on important points, include that in your response.
+    Only output your own response.
+
+    Response 1: {response1}
+    Response 2: {response2}
+    """)
+    chain = prompt | openai_client
+    return chain.invoke({"response1": response1, "response2": response2}).content
+
+def markdown_to_html(content):
+    return markdown2.markdown(content)
 
 def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
     global full_pdf_content, vector_store, rag_chain
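Taken together, the added helpers form a dual-path pipeline: the query is rewritten for retrieval, answered once by the Groq-backed RAG chain over the FAISS store and once by Gemini over the full document text, and the two answers are merged by generate_final_response. A minimal orchestration sketch under that reading follows; answer_query is a hypothetical name not present in this commit, and it assumes full_pdf_content and rag_chain have already been populated by load_pdfs:

def answer_query(query):
    # Rewrite the question into a keyword-focused, retrieval-friendly form (HyDE-Lite)
    optimized = preprocess_query(query)
    # Path 1: retrieval-augmented answer from the Groq chain over the FAISS index
    # (create_retrieval_chain returns a dict with an "answer" key)
    rag_answer = rag_chain.invoke({"input": optimized})["answer"]
    # Path 2: long-context answer from Gemini over the full PDF text
    gemini_answer = gemini_response(query, full_pdf_content)
    # Merge both answers and render the combined response as HTML
    return markdown_to_html(generate_final_response(rag_answer, gemini_answer))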