abdullahzunorain committed on
Commit
589047e
·
verified ·
1 Parent(s): a6e2eb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -73
app.py CHANGED
@@ -1,82 +1,160 @@
 
 
1
  import os
 
2
  import streamlit as st
3
- from sentence_transformers import SentenceTransformer, util
4
- from groq import Groq
5
- from PyPDF2 import PdfReader
6
-
7
-
8
-
9
- # Initialize the retriever and Groq client
10
- retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
11
- # client = Groq(api_key=groq_api) # Replace with your actual Groq API key
12
- key = os.getenv("groq_api")
13
- client = Groq(api_key = key)
14
-
15
- # Knowledge base (documents) and embeddings
16
- documents = [
17
- "Retrieval-Augmented Generation (RAG) is an AI framework that combines the strengths of retrieval-based and generative models.",
18
- "The main components of a RAG system are the retriever and the generator.",
19
- "A key benefit of Retrieval-Augmented Generation is that it can produce more accurate responses compared to standalone generative models.",
20
- "The retrieval process in a RAG system often relies on embedding-based models, like Sentence-BERT or DPR.",
21
- "Common use cases of RAG include chatbots, customer support systems, and knowledge retrieval for business intelligence."
22
- ]
23
- document_embeddings = retriever.encode(documents, convert_to_tensor=True)
24
-
25
- # Function to retrieve top relevant document and truncate context if too long
26
- def retrieve(query, top_k=1, max_tokens=100):
27
- query_embedding = retriever.encode(query, convert_to_tensor=True)
28
- hits = util.semantic_search(query_embedding, document_embeddings, top_k=top_k)
29
- top_docs = [documents[hit['corpus_id']] for hit in hits[0]]
30
-
31
- # Truncate context to max_tokens if necessary
32
- context = top_docs[0] if hits[0] else ""
33
- context = ' '.join(context.split()[:max_tokens]) # Limit to max_tokens words
34
- return context
35
-
36
- # Function to generate response using Groq
37
- def generate_response(query, context):
38
- response = client.chat.completions.create(
39
- messages=[
40
- {
41
- "role": "user",
42
- "content": f"Context: {context} Question: {query} Answer:"
43
- }
44
- ],
45
- model="gemma2-9b-it"
46
- )
47
- return response.choices[0].message.content
48
-
49
- # Function to handle PDF upload and text extraction
50
- def extract_text_from_pdf(file):
51
- pdf_reader = PdfReader(file)
52
  text = ""
53
- for page in pdf_reader.pages:
54
- text += page.extract_text()
 
55
  return text
56
 
57
- # Function to update knowledge base with new content from PDF
58
- def update_knowledge_base(pdf_text):
59
- global documents, document_embeddings
60
- documents.append(pdf_text)
61
- document_embeddings = retriever.encode(documents, convert_to_tensor=True)
62
 
63
- # Streamlit app layout
64
- st.title("RAG-based Question Answering App")
65
- st.write("Upload a PDF, ask questions based on its content, and get answers!")
 
 
66
 
67
- # Upload PDF file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
69
- if uploaded_file:
 
 
70
  pdf_text = extract_text_from_pdf(uploaded_file)
71
- update_knowledge_base(pdf_text)
72
- st.write("PDF content successfully added to the knowledge base.")
73
-
74
- # Question input
75
- question = st.text_input("Enter your question:")
76
- if question:
77
- retrieved_context = retrieve(question)
78
- if retrieved_context:
79
- answer = generate_response(question, retrieved_context)
80
- else:
81
- answer = "I have no knowledge about this topic."
82
- st.write("Answer:", answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import faiss
3
  import os
4
+ from sentence_transformers import SentenceTransformer
5
  import streamlit as st
6
+ from groq import Groq # Import Groq client library
7
+
8
+ # Initialize the Groq API client
9
+ groq_api_key = os.getenv("groq_api") # Set your Groq API key as an environment variable
10
+ client = Groq(api_key=groq_api_key)
11
+
12
+ # Initialize sentence transformer model and vector store
13
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
14
+ dimension = 384 # Dimension of embeddings in all-MiniLM-L6-v2
15
+ index = faiss.IndexFlatL2(dimension)
16
+
17
+ # PDF processing function
18
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or a
            binary file-like object exposing ``getvalue()`` or ``read()``,
            such as the object returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined in page order; an empty string when no
        page yields any text.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        document = fitz.open(pdf_file)
    else:
        # fitz.open() interprets its first positional argument as a filename,
        # so an in-memory upload has to be passed as raw bytes via `stream=`.
        # Prefer getvalue(): it returns the whole buffer regardless of the
        # current read position of the file object.
        if hasattr(pdf_file, "getvalue"):
            data = pdf_file.getvalue()
        else:
            data = pdf_file.read()
        document = fitz.open(stream=data, filetype="pdf")

    with document as pdf:
        return "".join(page.get_text() for page in pdf)
24
 
25
+ # Split text into chunks for embedding
26
def split_text(text, chunk_size=512):
    """Break *text* into consecutive chunks of at most *chunk_size* words.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens);
    each chunk is those words re-joined with single spaces. The final chunk
    may be shorter than *chunk_size*. Empty or whitespace-only text produces
    an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces
 
29
 
30
+ # Embed and add chunks to FAISS index
31
def embed_and_store_chunks(chunks):
    """Embed text chunks and append the vectors to the module-level FAISS index.

    Args:
        chunks: List of text chunks to embed with the module-level ``embedder``.

    Returns:
        Whatever ``embedder.encode(chunks)`` returned. When there is nothing
        to embed, the index is left untouched.
    """
    embeddings = embedder.encode(chunks)
    # A PDF with no extractable text yields no chunks; skip index.add() in
    # that case because there are no vectors to store.
    if len(embeddings) == 0:
        return embeddings
    index.add(embeddings)
    return embeddings
35
 
36
+ # Retrieve the most relevant chunks
37
def retrieve_chunks(question, top_k=3):
    """Return the text of the chunks nearest to *question*, joined by spaces.

    Uses the module-level ``embedder``, FAISS ``index`` and ``chunks`` list.

    Args:
        question: The user's question as a string.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks joined with single spaces, in the order FAISS
        returned them; an empty string when nothing valid was retrieved.
    """
    question_embedding = embedder.encode([question])
    _distances, indices = index.search(question_embedding, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors. Indexing chunks[-1] would silently return the last chunk, so
    # keep only ids that point at a real chunk.
    retrieved_chunks = [
        chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)
    ]
    return " ".join(retrieved_chunks)
42
+
43
+ # Generate answer using Groq API
44
def generate_answer(question, context):
    """Ask the Groq chat model to answer *question* given *context*.

    Args:
        question: The user's question.
        context: Retrieved text to ground the answer in.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    # The Groq client is driven through chat.completions.create(), the same
    # call the previous revision of this file used.
    # NOTE(review): the model id is carried over from that revision; confirm
    # it is still the intended model.
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gemma2-9b-it",
        max_tokens=100,
        temperature=0.7,
    )
    content = response.choices[0].message.content
    return (content or "").strip()
48
+
49
+ # Streamlit app
50
st.title("PDF Question-Answer Chatbot with RAG using Groq API")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from the PDF file
    pdf_text = extract_text_from_pdf(uploaded_file)
    # Split the text and embed/store chunks in FAISS
    chunks = split_text(pdf_text)
    if not chunks:
        # Image-only or empty PDFs produce no words, so there is nothing to
        # embed or to search against.
        st.warning("No extractable text was found in this PDF.")
    else:
        embed_and_store_chunks(chunks)
        st.success("PDF processed and knowledge base created!")

        # User question input. Kept inside the upload branch because
        # retrieval reads the `chunks` list built just above.
        question = st.text_input("Ask a question about the PDF content:")
        if question:
            # Retrieve relevant context and generate answer
            context = retrieve_chunks(question)
            answer = generate_answer(question, context)
            st.write("Answer:", answer)
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+ # import os
80
+ # import streamlit as st
81
+ # from sentence_transformers import SentenceTransformer, util
82
+ # from groq import Groq
83
+ # from PyPDF2 import PdfReader
84
+
85
+
86
+
87
+ # # Initialize the retriever and Groq client
88
+ # retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
89
+ # # client = Groq(api_key=groq_api) # Replace with your actual Groq API key
90
+ # key = os.getenv("groq_api")
91
+ # client = Groq(api_key = key)
92
+
93
+ # # Knowledge base (documents) and embeddings
94
+ # documents = [
95
+ # "Retrieval-Augmented Generation (RAG) is an AI framework that combines the strengths of retrieval-based and generative models.",
96
+ # "The main components of a RAG system are the retriever and the generator.",
97
+ # "A key benefit of Retrieval-Augmented Generation is that it can produce more accurate responses compared to standalone generative models.",
98
+ # "The retrieval process in a RAG system often relies on embedding-based models, like Sentence-BERT or DPR.",
99
+ # "Common use cases of RAG include chatbots, customer support systems, and knowledge retrieval for business intelligence."
100
+ # ]
101
+ # document_embeddings = retriever.encode(documents, convert_to_tensor=True)
102
+
103
+ # # Function to retrieve top relevant document and truncate context if too long
104
+ # def retrieve(query, top_k=1, max_tokens=100):
105
+ # query_embedding = retriever.encode(query, convert_to_tensor=True)
106
+ # hits = util.semantic_search(query_embedding, document_embeddings, top_k=top_k)
107
+ # top_docs = [documents[hit['corpus_id']] for hit in hits[0]]
108
+
109
+ # # Truncate context to max_tokens if necessary
110
+ # context = top_docs[0] if hits[0] else ""
111
+ # context = ' '.join(context.split()[:max_tokens]) # Limit to max_tokens words
112
+ # return context
113
+
114
+ # # Function to generate response using Groq
115
+ # def generate_response(query, context):
116
+ # response = client.chat.completions.create(
117
+ # messages=[
118
+ # {
119
+ # "role": "user",
120
+ # "content": f"Context: {context} Question: {query} Answer:"
121
+ # }
122
+ # ],
123
+ # model="gemma2-9b-it"
124
+ # )
125
+ # return response.choices[0].message.content
126
+
127
+ # # Function to handle PDF upload and text extraction
128
+ # def extract_text_from_pdf(file):
129
+ # pdf_reader = PdfReader(file)
130
+ # text = ""
131
+ # for page in pdf_reader.pages:
132
+ # text += page.extract_text()
133
+ # return text
134
+
135
+ # # Function to update knowledge base with new content from PDF
136
+ # def update_knowledge_base(pdf_text):
137
+ # global documents, document_embeddings
138
+ # documents.append(pdf_text)
139
+ # document_embeddings = retriever.encode(documents, convert_to_tensor=True)
140
+
141
+ # # Streamlit app layout
142
+ # st.title("RAG-based Question Answering App")
143
+ # st.write("Upload a PDF, ask questions based on its content, and get answers!")
144
+
145
+ # # Upload PDF file
146
+ # uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
147
+ # if uploaded_file:
148
+ # pdf_text = extract_text_from_pdf(uploaded_file)
149
+ # update_knowledge_base(pdf_text)
150
+ # st.write("PDF content successfully added to the knowledge base.")
151
+
152
+ # # Question input
153
+ # question = st.text_input("Enter your question:")
154
+ # if question:
155
+ # retrieved_context = retrieve(question)
156
+ # if retrieved_context:
157
+ # answer = generate_response(question, retrieved_context)
158
+ # else:
159
+ # answer = "I have no knowledge about this topic."
160
+ # st.write("Answer:", answer)