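"""PDF Question-Answer Chatbot with RAG using the Groq API.

Pipeline: extract text from an uploaded PDF with PyMuPDF, split it into word
chunks, embed the chunks with sentence-transformers, index them in FAISS,
retrieve the chunks closest to the user's question, and answer with the Groq
chat-completions API, all behind a Streamlit UI.

Run with (assuming this file is saved as app.py):
    streamlit run app.py
"""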

import os

import faiss
import fitz  # PyMuPDF
import streamlit as st
from groq import Groq  # Groq chat-completions client
from sentence_transformers import SentenceTransformer

# Initialize the Groq API client
groq_api_key = os.getenv("groq_api")  # Set your Groq API key in this environment variable
client = Groq(api_key=groq_api_key)

# Initialize the sentence-transformer model and the FAISS vector store
embedder = SentenceTransformer('all-MiniLM-L6-v2')
dimension = 384  # Embedding dimension of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)
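
# For reference, IndexFlatL2 does exact (brute-force) L2 search; a query like
#   distances, indices = index.search(query_embeddings, k)
# returns two (n_queries, k) arrays, with index entries of -1 as padding when
# the index holds fewer than k vectors.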

# PDF processing function
def extract_text_from_pdf(pdf_file):
    text = ""
    # Streamlit's uploader yields a file-like object, so open it as a byte stream
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf:
        for page in pdf:
            text += page.get_text()
    return text

# Split text into fixed-size word chunks for embedding
def split_text(text, chunk_size=512):
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
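
# Example: a 1,200-word document with the default chunk_size of 512 yields
# three chunks of 512, 512, and 176 words:
#   len(split_text(" ".join(["word"] * 1200)))  # -> 3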

# Embed chunks and add them to the FAISS index
def embed_and_store_chunks(chunks):
    embeddings = embedder.encode(chunks)  # float32 numpy array, as FAISS expects
    index.add(embeddings)
    return embeddings

# Retrieve the chunks most relevant to the question
def retrieve_chunks(question, chunks, top_k=3):
    question_embedding = embedder.encode([question])
    distances, indices = index.search(question_embedding, top_k)
    # Skip the -1 padding FAISS returns when the index holds fewer than top_k vectors
    retrieved_chunks = [chunks[idx] for idx in indices[0] if idx != -1]
    return " ".join(retrieved_chunks)

# Generate an answer using the Groq chat-completions API
def generate_answer(question, context):
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",  # same model the alternative version below uses
        max_tokens=100,
        temperature=0.7,
    )
    return response.choices[0].message.content.strip()
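
# Example call (illustrative strings only):
#   generate_answer("What is RAG?", "RAG combines retrieval with generation.")
# sends a single user message of the form "Context: ...\n\nQuestion: ...\nAnswer:"
# and returns the model's completion text.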

# Streamlit app
st.title("PDF Question-Answer Chatbot with RAG using Groq API")

# File uploader (note: Streamlit reruns this whole script on every interaction,
# so the index is rebuilt from scratch each run; fine for small PDFs)
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is not None:
    # Extract text from the PDF file
    pdf_text = extract_text_from_pdf(uploaded_file)

    # Split the text and embed/store the chunks in FAISS
    chunks = split_text(pdf_text)
    embed_and_store_chunks(chunks)
    st.success("PDF processed and knowledge base created!")

    # User question input
    question = st.text_input("Ask a question about the PDF content:")
    if question:
        # Retrieve relevant context and generate an answer
        context = retrieve_chunks(question, chunks)
        answer = generate_answer(question, context)
        st.write("Answer:", answer)
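
# Minimal sketch of the same pipeline outside Streamlit, assuming a local file
# named sample.pdf exists (the filename is illustrative):
#   with open("sample.pdf", "rb") as f:
#       chunks = split_text(extract_text_from_pdf(f))
#   embed_and_store_chunks(chunks)
#   question = "What is this document about?"
#   print(generate_answer(question, retrieve_chunks(question, chunks)))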

# ---------------------------------------------------------------------------
# Alternative implementation, kept commented out for reference: PyPDF2 for
# extraction, an in-memory document list, and util.semantic_search retrieval.
# ---------------------------------------------------------------------------
# import os
# import streamlit as st
# from sentence_transformers import SentenceTransformer, util
# from groq import Groq
# from PyPDF2 import PdfReader
#
# # Initialize the retriever and Groq client
# retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# key = os.getenv("groq_api")
# client = Groq(api_key=key)
#
# # Knowledge base (documents) and embeddings
# documents = [
#     "Retrieval-Augmented Generation (RAG) is an AI framework that combines the strengths of retrieval-based and generative models.",
#     "The main components of a RAG system are the retriever and the generator.",
#     "A key benefit of Retrieval-Augmented Generation is that it can produce more accurate responses compared to standalone generative models.",
#     "The retrieval process in a RAG system often relies on embedding-based models, like Sentence-BERT or DPR.",
#     "Common use cases of RAG include chatbots, customer support systems, and knowledge retrieval for business intelligence.",
# ]
# document_embeddings = retriever.encode(documents, convert_to_tensor=True)
#
# # Function to retrieve the top relevant document and truncate the context if too long
# def retrieve(query, top_k=1, max_tokens=100):
#     query_embedding = retriever.encode(query, convert_to_tensor=True)
#     hits = util.semantic_search(query_embedding, document_embeddings, top_k=top_k)
#     top_docs = [documents[hit['corpus_id']] for hit in hits[0]]
#     # Truncate the context to max_tokens if necessary
#     context = top_docs[0] if hits[0] else ""
#     context = ' '.join(context.split()[:max_tokens])  # limit to max_tokens words
#     return context
#
# # Function to generate a response using Groq
# def generate_response(query, context):
#     response = client.chat.completions.create(
#         messages=[
#             {
#                 "role": "user",
#                 "content": f"Context: {context} Question: {query} Answer:"
#             }
#         ],
#         model="gemma2-9b-it"
#     )
#     return response.choices[0].message.content
#
# # Function to handle PDF upload and text extraction
# def extract_text_from_pdf(file):
#     pdf_reader = PdfReader(file)
#     text = ""
#     for page in pdf_reader.pages:
#         text += page.extract_text()
#     return text
#
# # Function to update the knowledge base with new content from the PDF
# def update_knowledge_base(pdf_text):
#     global documents, document_embeddings
#     documents.append(pdf_text)
#     document_embeddings = retriever.encode(documents, convert_to_tensor=True)
#
# # Streamlit app layout
# st.title("RAG-based Question Answering App")
# st.write("Upload a PDF, ask questions based on its content, and get answers!")
#
# # Upload PDF file
# uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
# if uploaded_file:
#     pdf_text = extract_text_from_pdf(uploaded_file)
#     update_knowledge_base(pdf_text)
#     st.write("PDF content successfully added to the knowledge base.")
#
# # Question input
# question = st.text_input("Enter your question:")
# if question:
#     retrieved_context = retrieve(question)
#     if retrieved_context:
#         answer = generate_response(question, retrieved_context)
#     else:
#         answer = "I have no knowledge about this topic."
#     st.write("Answer:", answer)