import os

import faiss
import fitz  # PyMuPDF
import streamlit as st
from groq import Groq  # Groq client library
from sentence_transformers import SentenceTransformer
# Initialize the Groq API client (reads the API key from the "groq_api" environment variable)
groq_api_key = os.getenv("groq_api")
client = Groq(api_key=groq_api_key)
# Initialize the sentence-transformer embedding model and the FAISS vector store
embedder = SentenceTransformer('all-MiniLM-L6-v2')
dimension = 384  # Embedding dimension of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)
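
# Note: Streamlit re-runs this script on every interaction, so the model and
# index above are rebuilt each time. For larger documents, caching them (e.g.
# with st.cache_resource) would avoid the repeated work; it is omitted here to
# keep the sketch minimal.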
# Extract all text from an uploaded PDF
def extract_text_from_pdf(pdf_file):
    text = ""
    # Streamlit's uploader yields an in-memory file, so open it as a byte stream
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf:
        for page in pdf:
            text += page.get_text()
    return text
# Split text into fixed-size word chunks for embedding
def split_text(text, chunk_size=512):
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
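
# Example: split_text("a b c d", chunk_size=2) -> ["a b", "c d"]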
# Embed the chunks and add them to the FAISS index
def embed_and_store_chunks(chunks):
    # encode() returns a float32 numpy array, which is what the index expects
    embeddings = embedder.encode(chunks)
    index.add(embeddings)
    return embeddings
# Retrieve the most relevant chunks for a question
def retrieve_chunks(question, chunks, top_k=3):
    question_embedding = embedder.encode([question])
    distances, indices = index.search(question_embedding, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are stored
    retrieved_chunks = [chunks[idx] for idx in indices[0] if idx != -1]
    return " ".join(retrieved_chunks)
# Generate an answer with the Groq chat completions API
def generate_answer(question, context):
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",
        max_tokens=100,
        temperature=0.7,
    )
    return response.choices[0].message.content.strip()
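
# The model name above ("gemma2-9b-it") follows the earlier draft kept in the
# comments at the bottom of this file; any chat model available on Groq could
# be substituted.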
# Streamlit app
st.title("PDF Question-Answer Chatbot with RAG using Groq API")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is not None:
    # Extract text from the PDF file
    pdf_text = extract_text_from_pdf(uploaded_file)

    # Split the text and embed/store the chunks in FAISS
    chunks = split_text(pdf_text)
    embed_and_store_chunks(chunks)
    st.success("PDF processed and knowledge base created!")

    # User question input
    question = st.text_input("Ask a question about the PDF content:")
    if question:
        # Retrieve the relevant context and generate an answer
        context = retrieve_chunks(question, chunks)
        answer = generate_answer(question, context)
        st.write("Answer:", answer)
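
# To try the app locally (assuming this file is saved as app.py):
#   pip install streamlit pymupdf faiss-cpu sentence-transformers groq
#   export groq_api="YOUR_GROQ_API_KEY"
#   streamlit run app.py

# ---------------------------------------------------------------------------
# Earlier draft (kept for reference): the same RAG flow, but using PyPDF2 for
# text extraction and in-memory semantic search (sentence_transformers.util)
# in place of a FAISS index.
# ---------------------------------------------------------------------------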
# import os
# import streamlit as st
# from sentence_transformers import SentenceTransformer, util
# from groq import Groq
# from PyPDF2 import PdfReader
#
# # Initialize the retriever and Groq client
# retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# # client = Groq(api_key=groq_api)  # Replace with your actual Groq API key
# key = os.getenv("groq_api")
# client = Groq(api_key=key)
#
# # Knowledge base (documents) and embeddings
# documents = [
#     "Retrieval-Augmented Generation (RAG) is an AI framework that combines the strengths of retrieval-based and generative models.",
#     "The main components of a RAG system are the retriever and the generator.",
#     "A key benefit of Retrieval-Augmented Generation is that it can produce more accurate responses compared to standalone generative models.",
#     "The retrieval process in a RAG system often relies on embedding-based models, like Sentence-BERT or DPR.",
#     "Common use cases of RAG include chatbots, customer support systems, and knowledge retrieval for business intelligence."
# ]
# document_embeddings = retriever.encode(documents, convert_to_tensor=True)
#
# # Function to retrieve the top relevant document and truncate the context if too long
# def retrieve(query, top_k=1, max_tokens=100):
#     query_embedding = retriever.encode(query, convert_to_tensor=True)
#     hits = util.semantic_search(query_embedding, document_embeddings, top_k=top_k)
#     top_docs = [documents[hit['corpus_id']] for hit in hits[0]]
#     # Truncate the context to max_tokens words if necessary
#     context = top_docs[0] if hits[0] else ""
#     context = ' '.join(context.split()[:max_tokens])
#     return context
#
# # Function to generate a response using Groq
# def generate_response(query, context):
#     response = client.chat.completions.create(
#         messages=[
#             {
#                 "role": "user",
#                 "content": f"Context: {context} Question: {query} Answer:"
#             }
#         ],
#         model="gemma2-9b-it"
#     )
#     return response.choices[0].message.content
#
# # Function to handle PDF upload and text extraction
# def extract_text_from_pdf(file):
#     pdf_reader = PdfReader(file)
#     text = ""
#     for page in pdf_reader.pages:
#         text += page.extract_text()
#     return text
#
# # Function to update the knowledge base with new content from a PDF
# def update_knowledge_base(pdf_text):
#     global documents, document_embeddings
#     documents.append(pdf_text)
#     document_embeddings = retriever.encode(documents, convert_to_tensor=True)
#
# # Streamlit app layout
# st.title("RAG-based Question Answering App")
# st.write("Upload a PDF, ask questions based on its content, and get answers!")
#
# # Upload PDF file
# uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
# if uploaded_file:
#     pdf_text = extract_text_from_pdf(uploaded_file)
#     update_knowledge_base(pdf_text)
#     st.write("PDF content successfully added to the knowledge base.")
#
# # Question input
# question = st.text_input("Enter your question:")
# if question:
#     retrieved_context = retrieve(question)
#     if retrieved_context:
#         answer = generate_response(question, retrieved_context)
#     else:
#         answer = "I have no knowledge about this topic."
#     st.write("Answer:", answer)