# Source: Hugging Face Space file view (Space status: Sleeping; file size: 5,778 bytes).
# The viewer's per-line commit hashes and line-number gutter (1-139) were page
# residue rather than program text and are omitted here.
import gradio as gr
from pdfminer.high_level import extract_text
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import os
import markdown2
# Retrieve API keys from Hugging Face Spaces secrets
# Each key is read from the process environment and is None when unset.
openai_api_key = os.getenv("OPENAI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
google_api_key = os.getenv("GEMINI_API_KEY")

# One chat client per provider; these module-level names are used by the
# functions defined below.
openai_client = ChatOpenAI(
    model_name="gpt-4o",
    api_key=openai_api_key,
)
groq_client = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0,
    api_key=groq_api_key,
)
gemini_client = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    api_key=google_api_key,
)
# Function to extract text from PDF
def extract_pdf(pdf_path):
    """Return the text extracted from the PDF at ``pdf_path``.

    Extraction is best-effort: if ``extract_text`` raises, the error is
    printed and an empty string is returned instead of propagating.
    """
    text = ""
    try:
        text = extract_text(pdf_path)
    except Exception as err:
        # Report the failure but keep going so one bad file does not abort
        # the caller.
        print(f"Error extracting text from {pdf_path}: {str(err)}")
    return text
# Function to split text into chunks
def split_text(text):
    """Split ``text`` into chunks and wrap each chunk in a ``Document``.

    The splitter is a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=1000`` and ``chunk_overlap=100``.

    Returns:
        A list with one ``Document`` per chunk, in splitter order.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = chunker.split_text(text)
    return [Document(page_content=chunk) for chunk in chunks]
# Function to generate embeddings and store in vector database
def generate_embeddings(docs):
    """Build a FAISS vector store from ``docs`` using OpenAI embeddings.

    The embedding client is created with the module-level ``openai_api_key``.

    Returns:
        The store returned by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(
        docs,
        OpenAIEmbeddings(api_key=openai_api_key),
    )
# Function for query preprocessing
def preprocess_query(query):
    """Ask the OpenAI client to rewrite ``query`` as a regulation-style statement.

    Returns:
        The ``content`` of the model's reply to the rewrite prompt.
    """
    # Prompt text is kept exactly as originally written.
    # NOTE(review): "affitmative" looks like a typo for "affirmative"; left
    # unchanged here so the prompt sent to the model is not altered.
    rewrite_template = (
        "\n"
        "Transform the following query into a more detailed, keyword-rich affitmative statement that could appear in official data protection regulation documents:\n"
        "Query: {query}\n"
        "Transformed query:\n"
    )
    rewrite_prompt = ChatPromptTemplate.from_template(rewrite_template)
    rewriter = rewrite_prompt | openai_client
    reply = rewriter.invoke({"query": query})
    return reply.content
# Function to create RAG chain with Groq
def create_rag_chain(vector_store):
    """Build a retrieval chain that answers with the Groq client.

    The prompt places the retrieved ``{context}`` in the system message and
    the user's ``{input}`` in the human message.

    Args:
        vector_store: Store whose ``as_retriever()`` supplies the retriever.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    system_message = (
        "system",
        "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following context to answer the user's question:\n\n{context}",
    )
    human_message = ("human", "{input}")
    qa_prompt = ChatPromptTemplate.from_messages([system_message, human_message])
    answer_chain = create_stuff_documents_chain(groq_client, qa_prompt)
    retriever = vector_store.as_retriever()
    return create_retrieval_chain(retriever, answer_chain)
# Function for Gemini response with long context
def gemini_response(query, full_pdf_content):
    """Answer ``query`` with the Gemini client using the full document text.

    Args:
        query: Text placed in the human message.
        full_pdf_content: Text substituted for ``{context}`` in the system
            message.

    Returns:
        The ``content`` of the model's reply.
    """
    system_message = (
        "system",
        "You are an AI assistant helping with data protection and regulation compliance related queries.. Use the following full content of official regulation documents to answer the user's question:\n\n{context}",
    )
    human_message = ("human", "{input}")
    long_context_prompt = ChatPromptTemplate.from_messages(
        [system_message, human_message]
    )
    responder = long_context_prompt | gemini_client
    reply = responder.invoke({"context": full_pdf_content, "input": query})
    return reply.content
# Function to generate final response
def generate_final_response(query, response1, response2):
    """Ask the OpenAI client to synthesize two candidate answers into one.

    Args:
        query: The user's question, substituted for ``{query}``.
        response1: First candidate answer, substituted for ``{response1}``.
        response2: Second candidate answer, substituted for ``{response2}``.

    Returns:
        The ``content`` of the model's reply to the synthesis prompt.
    """
    # The prompt is assembled line by line; the text is unchanged.
    synthesis_template = (
        "\n"
        "As an AI assistant specializing in data protection and compliance for educators:\n"
        "[hidden states, scrartchpad]\n"
        "1. Analyze for yourself the following two AI-generated responses to the user query.\n"
        "2. Think of a comprehensive answer that combines the strengths of both responses.\n"
        "3. If the responses contradict each other, highlight this and if it might indicate a hallucination.\n"
        "[Output]\n"
        "4. Provide practical advice on how to meet regulatory requirements in the context of the user question based on the information given.\n"
        "User Query: {query}\n"
        "Response 1: {response1}\n"
        "Response 2: {response2}\n"
        "Your synthesized response:\n"
    )
    synthesis_prompt = ChatPromptTemplate.from_template(synthesis_template)
    synthesizer = synthesis_prompt | openai_client
    prompt_values = {
        "query": query,
        "response1": response1,
        "response2": response2,
    }
    return synthesizer.invoke(prompt_values).content
# Function to process the query
def process_query(user_query):
    """Answer a user question with the RAG, long-context and synthesis steps.

    The query is rewritten by ``preprocess_query``; the rewritten form is sent
    to the module-level ``rag_chain`` and to ``gemini_response`` (together
    with the module-level ``full_pdf_content``). The two answers are then
    merged by ``generate_final_response`` using the original user query.

    Args:
        user_query: The question text supplied by the caller.

    Returns:
        A 3-tuple ``(rag_response, gemini_resp, final_response)``. If any step
        raises, all three elements are the same
        ``"An error occurred: ..."`` message.
    """
    try:
        preprocessed_query = preprocess_query(user_query)
        print(f"Original query: {user_query}")
        print(f"Preprocessed query: {preprocessed_query}")
        rag_response = rag_chain.invoke({"input": preprocessed_query})["answer"]
        gemini_resp = gemini_response(preprocessed_query, full_pdf_content)
        final_response = generate_final_response(user_query, rag_response, gemini_resp)
        # Return the synthesized answer. The previous code returned the
        # undefined name ``html_content``, so every call raised NameError and
        # fell through to the error branch below.
        return rag_response, gemini_resp, final_response
    except Exception as e:
        # UI boundary: surface the failure in every output slot rather than
        # letting the exception propagate to the caller.
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, error_message
# Initialize
# Regulation PDFs to load, given as relative paths.
pdf_paths = ["GDPR.pdf", "FERPA.pdf", "COPPA.pdf"]
all_documents = []
_text_parts = []
for pdf_path in pdf_paths:
    extracted_text = extract_pdf(pdf_path)
    # Every document's text is followed by a blank-line separator.
    _text_parts.append(extracted_text + "\n\n")
    all_documents += split_text(extracted_text)
# Concatenated text of all PDFs, used for the long-context model.
full_pdf_content = "".join(_text_parts)
# Chunk index and retrieval chain used by process_query.
vector_store = generate_embeddings(all_documents)
rag_chain = create_rag_chain(vector_store)
# Gradio interface
# Single text input for the user's question.
_question_box = gr.Textbox(label="Ask your data protection related question")
# One output box per value returned by process_query.
_response_boxes = [
    gr.Textbox(label="RAG Pipeline (Llama3.1) Response"),
    gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response"),
    gr.Textbox(label="Final (GPT-4o) Response"),
]
iface = gr.Interface(
    fn=process_query,
    inputs=_question_box,
    outputs=_response_boxes,
    title="Data Protection Team",
    description="Get responses combining advanced RAG, Long Context, and SOTA models to data protection related questions (GDPR, FERPA, COPPA).",
    allow_flagging="never",
)
# Launch the interface
iface.launch()
|