Jatin Mehra committed · 52c6dbe
Parent(s): initial
Files changed:
- app.py (+154, -0)
- preprocessing.py (+128, -0)
- requirements.txt (+5, -0)
app.py
ADDED
@@ -0,0 +1,154 @@
import os
import tempfile
import streamlit as st
from streamlit_chat import message
from preprocessing import Model

# Home Page Setup
st.set_page_config(
    page_title="PDF Insight Pro",
    page_icon="📄",
    layout="centered",
)

# Custom CSS for a more polished look
st.markdown("""
<style>
.main {
    background-color: #f5f5f5;
}
.stButton button {
    background-color: #4CAF50;
    color: white;
    border-radius: 8px;
}
.stTextInput input {
    border-radius: 8px;
    padding: 10px;
}
.stFileUploader input {
    border-radius: 8px;
}
.stMarkdown h1 {
    color: #4CAF50;
}
</style>
""", unsafe_allow_html=True)

# Custom title and header
st.title("📄 PDF Insight Pro")
st.subheader("Empower Your Documents with AI-Driven Insights")

def display_messages():
    """
    Displays the chat messages in the Streamlit UI.
    """
    st.subheader("🗨️ Conversation")
    st.markdown("---")
    for i, (msg, is_user) in enumerate(st.session_state["messages"]):
        message(msg, is_user=is_user, key=str(i))
    st.session_state["process_input_spinner"] = st.empty()

def process_user_input():
    """
    Processes the user input by generating a response from the assistant.
    """
    if st.session_state["user_input"] and len(st.session_state["user_input"].strip()) > 0:
        user_input = st.session_state["user_input"].strip()
        with st.session_state["process_input_spinner"], st.spinner("Analyzing..."):
            agent_response = st.session_state["assistant"].get_response(
                user_input,
                st.session_state["temperature"],
                st.session_state["max_tokens"],
                st.session_state["model"]
            )

        st.session_state["messages"].append((user_input, True))
        st.session_state["messages"].append((agent_response, False))
        st.session_state["user_input"] = ""

def process_file():
    """
    Processes the uploaded PDF files and appends their content to the context.
    """
    for file in st.session_state["file_uploader"]:
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(file.getbuffer())
            file_path = tf.name

        with st.session_state["process_file_spinner"], st.spinner(f"Processing {file.name}..."):
            try:
                st.session_state["assistant"].add_to_context(file_path)
            except Exception as e:
                st.error(f"Failed to process file {file.name}: {str(e)}")
        os.remove(file_path)

def main_page():
    """
    Main function to set up the Streamlit UI and handle user interactions.
    """
    # Initialize session state variables
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    if "assistant" not in st.session_state:
        st.session_state["assistant"] = Model()

    if "user_input" not in st.session_state:
        st.session_state["user_input"] = ""

    if "temperature" not in st.session_state:
        st.session_state["temperature"] = 0.5

    if "max_tokens" not in st.session_state:
        st.session_state["max_tokens"] = 550

    if "model" not in st.session_state:
        st.session_state["model"] = "llama-3.1-8b-instant"

    # File uploader
    st.subheader("📤 Upload Your PDF Documents")
    st.file_uploader(
        "Choose PDF files to analyze",
        type=["pdf"],
        key="file_uploader",
        on_change=process_file,
        accept_multiple_files=True,
    )

    st.session_state["process_file_spinner"] = st.empty()

    # Document management section
    if st.session_state["assistant"].contexts:
        st.subheader("🗂️ Manage Uploaded Documents")
        for i, context in enumerate(st.session_state["assistant"].contexts):
            st.text_area(f"Document {i+1} Context", context[:500] + "..." if len(context) > 500 else context, height=100)
            if st.button(f"Remove Document {i+1}"):
                st.session_state["assistant"].remove_from_context(i)

    # Model settings
    with st.expander("⚙️ Customize AI Settings", expanded=True):
        st.slider("Sampling Temperature", min_value=0.0, max_value=1.0, step=0.1, key="temperature", help="Higher values make output more random.")
        st.slider("Max Tokens", min_value=50, max_value=1000, step=50, key="max_tokens", help="Limits the length of the response.")
        st.selectbox("Choose AI Model", ["llama-3.1-8b-instant", "llama3-70b-8192", "gemma-7b-it"], key="model")

    # Display messages and input box
    display_messages()
    st.text_input("Type your query and hit Enter", key="user_input", on_change=process_user_input, placeholder="Ask something about your documents...")

    # Developer info and bug report
    st.subheader("🐞 Bug Report")
    st.markdown("""
If you encounter any bugs or issues while using the app, please send a bug report to the developer. You can include a screenshot (optional) to help identify the problem.
""")
    st.subheader("💡 Suggestions")
    st.markdown("""
Suggestions to improve the app's UI and user experience are also welcome. Feel free to reach out to the developer with your suggestions.
""")
    st.subheader("👨‍💻 Developer Info")
    st.markdown("""
**Developer**: Jatin Mehra\n
**Email**: [email protected]\n
**Mobile**: 9910364780\n
""")

if __name__ == "__main__":
    main_page()
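The input handling above leans on Streamlit's callback pattern: each widget writes its value into st.session_state under its key, and the on_change handler runs before the next rerun, so it can read and then reset that value. A stripped-down sketch of just that mechanism (the "box" key, "log" key, and handle_input name are illustrative, not part of this commit):

import streamlit as st

if "log" not in st.session_state:
    st.session_state["log"] = []

def handle_input():
    # on_change fires before the script reruns, so the widget's value is
    # already in session_state and can be consumed and cleared here.
    text = st.session_state["box"].strip()
    if text:
        st.session_state["log"].append(text)
        st.session_state["box"] = ""  # resetting widget state is allowed inside its own callback

st.text_input("Say something", key="box", on_change=handle_input)
for entry in st.session_state["log"]:
    st.write(entry)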
preprocessing.py
ADDED
@@ -0,0 +1,128 @@
import os
import PyPDF2
from groq import Groq
import streamlit as st
from collections import defaultdict

class Model:
    """
    A class that represents a model for generating responses based on a given context and query.
    """

    def __init__(self):
        """
        Initializes the Model object and sets up the Groq client.
        """
        # api_key = os.getenv("GROQ_API_KEY")
        api_key = st.secrets["GROQ_API_KEY"]
        if not api_key:
            raise ValueError("GROQ_API_KEY is not set.")
        self.client = Groq(api_key=api_key)
        self.contexts = []
        self.cache = defaultdict(dict)  # Caching for repeated queries

    def extract_text_from_pdf(self, pdf_file):
        """
        Extracts text from a PDF file.
        Args:
        - pdf_file: The file-like object of the PDF.
        Returns:
        - text: The extracted text from the PDF file.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() or ""  # extract_text() may return None for image-only pages
            return text
        except Exception as e:
            raise ValueError(f"Error extracting text: {str(e)}")

    def generate_response(self, context, query, temperature, max_tokens, model):
        """
        Generates a response based on the given context and query.
        Args:
        - context: The context for generating the response.
        - query: The query or question.
        - temperature: The sampling temperature for response generation.
        - max_tokens: The maximum number of tokens for the response.
        - model: The model ID to be used for generating the response.
        Returns:
        - response: The generated response.
        """
        # Caching check: a hit requires the same query *and* the same combined
        # context, so adding or removing a document invalidates old answers.
        if query in self.cache and self.cache[query]["context"] == context:
            return self.cache[query]["response"]

        messages = [
            {"role": "system", "content": f"Context: {context}"},
            {"role": "user", "content": query},
        ]
        try:
            completion = self.client.chat.completions.create(
                model=model,  # Model ID
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            response = completion.choices[0].message.content
            self.cache[query]["context"] = context
            self.cache[query]["response"] = response  # Cache the response
            return response
        except Exception as e:
            return f"API request failed: {str(e)}"

    def add_to_context(self, file_path: str):
        """
        Reads a PDF file and appends its content to the context for generating responses.
        Args:
        - file_path: The path to the PDF file.
        """
        try:
            with open(file_path, "rb") as pdf_file:
                context = self.extract_text_from_pdf(pdf_file)
                self.contexts.append(context)
        except Exception as e:
            raise ValueError(f"Error processing PDF: {str(e)}")

    def remove_from_context(self, index: int):
        """
        Removes a document from the context based on its index.
        Args:
        - index: The index of the document to remove.
        """
        if 0 <= index < len(self.contexts):
            self.contexts.pop(index)
        else:
            raise ValueError("Invalid index for removing context.")

    def get_combined_context(self):
        """
        Combines all contexts into a single context string.
        Returns:
        - combined_context: The combined context from all documents.
        """
        return "\n".join(self.contexts)

    def get_response(self, question: str, temperature: float, max_tokens: int, model: str):
        """
        Generates a response based on the given question and the current combined context.
        Args:
        - question: The user's question.
        - temperature: The sampling temperature for response generation.
        - max_tokens: The maximum number of tokens for the response.
        - model: The model ID to be used for generating the response.
        Returns:
        - response: The generated response or a prompt to upload a document.
        """
        if not self.contexts:
            return "Please upload a document."
        combined_context = self.get_combined_context()
        return self.generate_response(combined_context, question, temperature, max_tokens, model)

    def clear(self):
        """
        Clears the current context.
        """
        self.contexts = []
        self.cache.clear()
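A minimal usage sketch for the Model class outside the Streamlit UI. Assumptions: a .streamlit/secrets.toml providing GROQ_API_KEY is present (since __init__ reads st.secrets), and "report.pdf" is a hypothetical file; the parameter values mirror the app's defaults:

from preprocessing import Model

assistant = Model()                       # reads GROQ_API_KEY from st.secrets
assistant.add_to_context("report.pdf")    # hypothetical file; its text is extracted and stored
answer = assistant.get_response(
    "Summarize the document.",            # user question
    temperature=0.5,                      # app default
    max_tokens=550,                       # app default
    model="llama-3.1-8b-instant",         # app default model ID
)
print(answer)
assistant.clear()                         # drop all contexts and cached answers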
requirements.txt
ADDED
@@ -0,0 +1,5 @@
streamlit==1.31.1
streamlit-chat==0.1.1
langchain-community==0.0.24
PyPDF2==3.0.1
groq==0.9.0
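To reproduce the Space locally (a sketch, assuming a working Python environment and a Groq API key): install the pins with "pip install -r requirements.txt", put GROQ_API_KEY = "..." in .streamlit/secrets.toml, then launch with "streamlit run app.py". Note that langchain-community is pinned here although neither app.py nor preprocessing.py imports it in this commit.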