jeremierostan committed (verified)
Commit f7cfaf7 · 1 Parent(s): 6c30aba

Update app.py

Files changed (1):
  1. app.py +69 -63
app.py CHANGED
@@ -13,7 +13,7 @@ from langchain.chains import create_retrieval_chain
import os
import markdown2

-# Retrieve API keys from HF secrets
+# Retrieve API keys from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')
google_api_key = os.getenv('GEMINI_API_KEY')
@@ -36,53 +36,6 @@ vector_store = None
rag_chain = None
pdfs_loaded = False

-# Function to load regulations with checked boxes or uploaded
-def load_pdfs(selected_regulations, additional_pdfs):
-    global full_pdf_content, vector_store, rag_chain, pdfs_loaded
-
-    documents = []
-    full_pdf_content = ""
-
-    print(f"Selected regulations: {selected_regulations}")  # Debug print
-
-    for regulation in selected_regulations:
-        if regulation in regulation_pdfs:
-            pdf_path = regulation_pdfs[regulation]
-            if os.path.exists(pdf_path):
-                pdf_content = extract_pdf(pdf_path)
-                if pdf_content:
-                    full_pdf_content += pdf_content + "\n\n"
-                    documents.extend(split_text(pdf_content))
-                    print(f"Loaded {regulation} PDF")
-                else:
-                    print(f"Failed to extract content from {regulation} PDF")
-            else:
-                print(f"PDF file for {regulation} not found at {pdf_path}")
-
-    # Load additional user-uploaded PDFs
-    if additional_pdfs is not None:
-        for pdf_file in additional_pdfs:
-            pdf_content = extract_pdf(pdf_file.name)
-            if pdf_content:
-                full_pdf_content += pdf_content + "\n\n"
-                documents.extend(split_text(pdf_content))
-                print(f"Loaded additional PDF: {pdf_file.name}")
-            else:
-                print(f"Failed to extract content from uploaded PDF: {pdf_file.name}")
-
-    if not documents:
-        pdfs_loaded = False
-        return "No PDFs were successfully loaded. Please check your selections and uploads."
-
-    print(f"Total documents loaded: {len(documents)}")
-    print(f"Total content length: {len(full_pdf_content)} characters")
-
-    vector_store = generate_embeddings(documents)
-    rag_chain = create_rag_chain(vector_store)
-
-    pdfs_loaded = True
-    return f"PDFs loaded and RAG system updated successfully! Loaded {len(documents)} document chunks."
-
# Function to extract text from PDF
def extract_pdf(pdf_path):
    try:
@@ -146,7 +99,62 @@ def generate_final_response(response1, response2):
    chain = prompt | openai_client
    return chain.invoke({"response1": response1, "response2": response2}).content

-# Function to process the query
+def markdown_to_html(content):
+    return markdown2.markdown(content)
+
+def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
+    global full_pdf_content, vector_store, rag_chain, pdfs_loaded
+
+    documents = []
+    full_pdf_content = ""
+
+    # Load selected regulation PDFs
+    selected_regulations = []
+    if gdpr:
+        selected_regulations.append("GDPR")
+    if ferpa:
+        selected_regulations.append("FERPA")
+    if coppa:
+        selected_regulations.append("COPPA")
+
+    for regulation in selected_regulations:
+        if regulation in regulation_pdfs:
+            pdf_path = regulation_pdfs[regulation]
+            if os.path.exists(pdf_path):
+                pdf_content = extract_pdf(pdf_path)
+                if pdf_content:
+                    full_pdf_content += pdf_content + "\n\n"
+                    documents.extend(split_text(pdf_content))
+                    print(f"Loaded {regulation} PDF")
+                else:
+                    print(f"Failed to extract content from {regulation} PDF")
+            else:
+                print(f"PDF file for {regulation} not found at {pdf_path}")
+
+    # Load additional user-uploaded PDFs
+    if additional_pdfs is not None:
+        for pdf_file in additional_pdfs:
+            pdf_content = extract_pdf(pdf_file.name)
+            if pdf_content:
+                full_pdf_content += pdf_content + "\n\n"
+                documents.extend(split_text(pdf_content))
+                print(f"Loaded additional PDF: {pdf_file.name}")
+            else:
+                print(f"Failed to extract content from uploaded PDF: {pdf_file.name}")
+
+    if not documents:
+        pdfs_loaded = False
+        return "No PDFs were successfully loaded. Please check your selections and uploads."
+
+    print(f"Total documents loaded: {len(documents)}")
+    print(f"Total content length: {len(full_pdf_content)} characters")
+
+    vector_store = generate_embeddings(documents)
+    rag_chain = create_rag_chain(vector_store)
+
+    pdfs_loaded = True
+    return f"PDFs loaded and RAG system updated successfully! Loaded {len(documents)} document chunks."
+
def process_query(user_query):
    global rag_chain, full_pdf_content, pdfs_loaded

@@ -168,19 +176,15 @@ def process_query(user_query):

    return rag_response, gemini_resp, html_content

-# Function to output the final response as markdown
-def markdown_to_html(content):
-    return markdown2.markdown(content)
-
# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Data Protection Team")
    gr.Markdown("Get responses combining advanced RAG, Long Context, and SOTA models to data protection related questions.")

-    regulations_checkboxes = gr.CheckboxGroup(
-        choices=["GDPR (EU)", "FERPA (US)", "COPPA (US <13)"],
-        label="Select Regulations"
-    )
+    with gr.Row():
+        gdpr_checkbox = gr.Checkbox(label="GDPR (EU)")
+        ferpa_checkbox = gr.Checkbox(label="FERPA (US)")
+        coppa_checkbox = gr.Checkbox(label="COPPA (US <13)")

    gr.Markdown("**Optional: upload additional PDFs if needed (national regulation, school policy)**")
    additional_pdfs = gr.File(
@@ -202,12 +206,14 @@ with gr.Blocks() as iface:
    gemini_output = gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response")
    final_output = gr.HTML(label="Final (GPT-4o) Response")

-    def prepare_regulations(selected):
-        return [reg.split()[0] for reg in selected]
-
    load_button.click(
-        lambda selected, additional: load_pdfs(prepare_regulations(selected), additional),
-        inputs=[regulations_checkboxes, additional_pdfs],
+        load_pdfs,
+        inputs=[
+            gdpr_checkbox,
+            ferpa_checkbox,
+            coppa_checkbox,
+            additional_pdfs
+        ],
        outputs=load_output
    )

@@ -217,4 +223,4 @@ with gr.Blocks() as iface:
        outputs=[rag_output, gemini_output, final_output]
    )

-iface.launch()
+iface.launch()
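The core of this commit is the rewiring of `load_button.click`: because Gradio passes each listed input component's current value to the handler as a separate positional argument, the new `load_pdfs(gdpr, ferpa, coppa, additional_pdfs)` signature can be connected directly to the three `gr.Checkbox` components and the `gr.File` upload, and the removed `prepare_regulations` helper and lambda adapter are no longer needed. The sketch below is a minimal, hypothetical reproduction of just that wiring, not the Space's actual code; the handler body, the `file_count="multiple"` setting, and the button/textbox labels are illustrative assumptions.

```python
# Minimal sketch (hypothetical): each input component's value arrives as its own
# positional argument, so the checkboxes are received as plain booleans.
import gradio as gr

def load_pdfs(gdpr, ferpa, coppa, additional_pdfs):
    # Stand-in body: just report what was selected and how many files were uploaded.
    selected = [name for name, flag in
                [("GDPR", gdpr), ("FERPA", ferpa), ("COPPA", coppa)] if flag]
    uploads = 0 if additional_pdfs is None else len(additional_pdfs)
    return f"Selected: {selected or 'none'}; uploaded files: {uploads}"

with gr.Blocks() as demo:
    with gr.Row():
        gdpr_checkbox = gr.Checkbox(label="GDPR (EU)")
        ferpa_checkbox = gr.Checkbox(label="FERPA (US)")
        coppa_checkbox = gr.Checkbox(label="COPPA (US <13)")
    additional_pdfs = gr.File(file_count="multiple", label="Additional PDFs")
    load_button = gr.Button("Load PDFs")
    load_output = gr.Textbox(label="Load status")

    # The function is passed directly; no lambda or string parsing is required.
    load_button.click(
        load_pdfs,
        inputs=[gdpr_checkbox, ferpa_checkbox, coppa_checkbox, additional_pdfs],
        outputs=load_output,
    )

if __name__ == "__main__":
    demo.launch()
```

Compared with the removed `gr.CheckboxGroup`, the trade-off is three separate boolean arguments instead of a single list of selected labels, which drops the string parsing (`reg.split()[0]`) from the UI layer at the cost of one extra parameter per regulation.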