import fitz  # PyMuPDF
import faiss
import os
from sentence_transformers import SentenceTransformer
import streamlit as st
from groq import Groq  # Import Groq client library
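# Run with:   streamlit run <this file>   (set the "groq_api" env var first)
# Requires:   pip install streamlit pymupdf faiss-cpu sentence-transformers groq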

# Initialize the Groq API client
groq_api_key = os.getenv("groq_api")  # Set your Groq API key as an environment variable
client = Groq(api_key=groq_api_key)

# Initialize the sentence-transformer model (cached across Streamlit reruns)
@st.cache_resource
def load_embedder():
    return SentenceTransformer('all-MiniLM-L6-v2')

embedder = load_embedder()
dimension = 384  # Embedding dimension of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)  # exact L2 search; rebuilt on each rerun

# PDF processing function
def extract_text_from_pdf(pdf_file):
    text = ""
    # Streamlit's uploader returns a file-like object, so open it as a byte stream
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf:
        for page in pdf:
            text += page.get_text()
    return text

# Split text into fixed-size chunks for embedding (chunk_size counts words, not tokens)
def split_text(text, chunk_size=512):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Embed and add chunks to FAISS index
def embed_and_store_chunks(chunks):
    embeddings = embedder.encode(chunks)
    index.add(embeddings)
    return embeddings

# Retrieve the most relevant chunks for a question
def retrieve_chunks(question, chunks, top_k=3):
    question_embedding = embedder.encode([question])
    distances, indices = index.search(question_embedding, top_k)
    # FAISS pads results with -1 when fewer than top_k vectors are indexed
    retrieved_chunks = [chunks[idx] for idx in indices[0] if idx != -1]
    return " ".join(retrieved_chunks)

# Generate an answer using the Groq chat completions API
def generate_answer(question, context):
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",
        max_tokens=100,
        temperature=0.7,
    )
    return response.choices[0].message.content.strip()

# Streamlit app
st.title("PDF Question-Answering Chatbot (RAG with the Groq API)")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from the PDF file
    pdf_text = extract_text_from_pdf(uploaded_file)
    # Split the text and embed/store chunks in FAISS
    chunks = split_text(pdf_text)
    embed_and_store_chunks(chunks)
    st.success("PDF processed and knowledge base created!")

    # User question input
    question = st.text_input("Ask a question about the PDF content:")
    if question:
        # Retrieve relevant context and generate answer
        context = retrieve_chunks(question, chunks)
        answer = generate_answer(question, context)
        st.write("Answer:", answer)

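# -----------------------------------------------------------------------------
# Earlier version, kept for reference: PyPDF2 text extraction with an in-memory
# sentence-transformers knowledge base instead of FAISS.
# -----------------------------------------------------------------------------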
# import os
# import streamlit as st
# from sentence_transformers import SentenceTransformer, util
# from groq import Groq
# from PyPDF2 import PdfReader



# # Initialize the retriever and Groq client
# retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# # client = Groq(api_key=groq_api)  # Replace with your actual Groq API key
# key = os.getenv("groq_api")
# client = Groq(api_key = key)

# # Knowledge base (documents) and embeddings
# documents = [
#     "Retrieval-Augmented Generation (RAG) is an AI framework that combines the strengths of retrieval-based and generative models.",
#     "The main components of a RAG system are the retriever and the generator.",
#     "A key benefit of Retrieval-Augmented Generation is that it can produce more accurate responses compared to standalone generative models.",
#     "The retrieval process in a RAG system often relies on embedding-based models, like Sentence-BERT or DPR.",
#     "Common use cases of RAG include chatbots, customer support systems, and knowledge retrieval for business intelligence."
# ]
# document_embeddings = retriever.encode(documents, convert_to_tensor=True)

# # Function to retrieve top relevant document and truncate context if too long
# def retrieve(query, top_k=1, max_tokens=100):
#     query_embedding = retriever.encode(query, convert_to_tensor=True)
#     hits = util.semantic_search(query_embedding, document_embeddings, top_k=top_k)
#     top_docs = [documents[hit['corpus_id']] for hit in hits[0]]
    
#     # Truncate context to max_tokens if necessary
#     context = top_docs[0] if hits[0] else ""
#     context = ' '.join(context.split()[:max_tokens])  # Limit to max_tokens words
#     return context

# # Function to generate response using Groq
# def generate_response(query, context):
#     response = client.chat.completions.create(
#         messages=[
#             {
#                 "role": "user",
#                 "content": f"Context: {context} Question: {query} Answer:"
#             }
#         ],
#         model="gemma2-9b-it"
#     )
#     return response.choices[0].message.content

# # Function to handle PDF upload and text extraction
# def extract_text_from_pdf(file):
#     pdf_reader = PdfReader(file)
#     text = ""
#     for page in pdf_reader.pages:
#         text += page.extract_text()
#     return text

# # Function to update knowledge base with new content from PDF
# def update_knowledge_base(pdf_text):
#     global documents, document_embeddings
#     documents.append(pdf_text)
#     document_embeddings = retriever.encode(documents, convert_to_tensor=True)

# # Streamlit app layout
# st.title("RAG-based Question Answering App")
# st.write("Upload a PDF, ask questions based on its content, and get answers!")

# # Upload PDF file
# uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
# if uploaded_file:
#     pdf_text = extract_text_from_pdf(uploaded_file)
#     update_knowledge_base(pdf_text)
#     st.write("PDF content successfully added to the knowledge base.")

# # Question input
# question = st.text_input("Enter your question:")
# if question:
#     retrieved_context = retrieve(question)
#     if retrieved_context:
#         answer = generate_response(question, retrieved_context)
#     else:
#         answer = "I have no knowledge about this topic."
#     st.write("Answer:", answer)