InkeyDevelopment commited on
Commit
77c8f74
·
verified ·
1 Parent(s): 40e893c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -0
app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import json
import os
import re

# Third-party
# (duplicates removed: faiss, numpy and SentenceTransformer were imported
# twice and json three times in the original)
import faiss
import fitz  # PyMuPDF for text extraction
import numpy as np
from dotenv import load_dotenv
from flask import Flask, render_template, request, jsonify
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer

# Load environment variables (e.g. GROQ_API_KEY) from a local .env file.
load_dotenv()
19
+
20
def extract_text_images(pdf_path, output_dir="static/output_images"):
    """Extract per-page text and embedded images from a PDF.

    For every page of *pdf_path* the page text and all embedded images are
    collected; images are written to *output_dir* and the combined records
    are dumped to ``pdf_data.json`` as a list of
    ``{"page": int, "text": str, "images": [path, ...]}`` dicts.

    Returns a short status string on success.
    """
    # exist_ok=True is idempotent and avoids the exists()/makedirs race
    # the original had.
    os.makedirs(output_dir, exist_ok=True)

    data = []
    # Context manager guarantees the document handle is closed even on
    # error (the original never closed it).
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text("text")

            image_paths = []
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first tuple element is the image's xref id
                base_image = doc.extract_image(xref)
                image_filename = (
                    f"{output_dir}/page_{page_num + 1}_img_{img_index + 1}"
                    f".{base_image['ext']}"
                )
                with open(image_filename, "wb") as img_file:
                    img_file.write(base_image["image"])
                image_paths.append(image_filename)

            data.append({"page": page_num + 1, "text": text, "images": image_paths})

    with open("pdf_data.json", "w") as f:
        json.dump(data, f, indent=4)

    return "Extraction completed!"
52
+
53
# Path to the source manual.  NOTE(review): extraction runs at import time
# on every startup and rewrites pdf_data.json plus the page images.
pdf_path = "./Exelsys easyHR v10 User Guide.pdf"
extract_text_images(pdf_path)
55
+
56
+
57
# Load the Hugging Face sentence-embedding model (all-MiniLM-L6-v2,
# 384-dim output); used by get_embedding() and re-created below for the
# Flask app's query encoding.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
59
+
60
def get_embedding(text):
    """Encode *text* with the shared SentenceTransformer model.

    Returns the embedding as a NumPy array (``convert_to_numpy=True``).
    """
    vector = model.encode(text, convert_to_numpy=True)
    return vector
62
+
63
def store_embeddings(data_path="pdf_data.json",
                     index_path="faiss_index.bin",
                     metadata_path="metadata.json"):
    """Embed each page's text and persist a FAISS index plus metadata.

    Reads the page records produced by ``extract_text_images`` from
    *data_path*, encodes each page's text with the shared MiniLM model,
    then writes the flat L2 index to *index_path* and the page metadata
    (page number, text, image paths) to *metadata_path*.  The three paths
    default to the original hard-coded file names, so existing callers
    are unaffected.

    Returns a short status string on success.
    """
    with open(data_path) as f:
        data = json.load(f)

    dimension = 384  # embedding size of all-MiniLM-L6-v2
    index = faiss.IndexFlatL2(dimension)
    metadata = []

    # Original used enumerate() but never used the index variable.
    for entry in data:
        embedding = np.array(get_embedding(entry["text"])).astype("float32")
        index.add(np.array([embedding]))
        metadata.append({
            "page": entry["page"],
            "text": entry["text"],
            "images": entry["images"],
        })

    faiss.write_index(index, index_path)

    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=4)

    return "Embeddings stored successfully!"
82
+
83
# Build the FAISS index and metadata files at import time so the Flask
# app below can load them.  NOTE(review): re-embeds the whole PDF on
# every startup.
store_embeddings()
84
+
85
+
86
+
87
app = Flask(__name__)

# Load Model and FAISS Index built above (model is re-created here even
# though the same checkpoint was loaded earlier in the script).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
index = faiss.read_index("faiss_index.bin")

# Groq-hosted Llama model used for answer generation; the API key comes
# from the .env file loaded at the top of the script.
groq_api_key = os.getenv('GROQ_API_KEY')
model_name = "llama-3.3-70b-versatile"

# temperature=0 for deterministic, instruction-following answers.
llm = ChatGroq(
    temperature=0,
    groq_api_key=groq_api_key,
    model_name=model_name
)

# Per-page metadata written by store_embeddings(); row i corresponds to
# FAISS vector i.
with open("metadata.json") as f:
    metadata = json.load(f)
104
+
105
+
106
# Query-classification patterns in priority order (first match wins),
# preserving the original greeting -> thank_you -> small_talk ->
# unrelated -> goodbye -> rude precedence.  Compiled once at import time
# instead of being re-parsed on every request.
_QUERY_PATTERNS = [
    ("greeting", [
        r"\bhello\b", r"\bhi\b", r"\bhey\b", r"\bhola\b", r"\bgreetings\b",
        r"\bwhat('s| is) up\b", r"\bhowdy\b", r"\bhiya\b", r"\byo\b",
        r"\bgood (morning|afternoon|evening|day|night)\b",
        r"\bhow (are|r) you\b", r"\bhow's it going\b", r"\bhow have you been\b",
        r"\bhope you are (doing )?(well|good|fine)\b", r"\bnice to meet you\b",
        r"\bpleased to meet you\b",
    ]),
    ("thank_you", [
        r"\bthank(s| you)\b", r"\bthanks a lot\b", r"\bthanks so much\b",
        r"\bthank you very much\b", r"\bappreciate it\b", r"\bmuch obliged\b",
        r"\bgrateful\b", r"\bcheers\b",
    ]),
    ("small_talk", [
        r"\bhow (are|r) you\b", r"\bhow's your day\b", r"\bwhat's up\b",
        r"\bhow's it going\b", r"\bhow have you been\b", r"\bhope you are well\b",
    ]),
    ("unrelated", [
        r"\btell me a joke\b", r"\bwho won\b", r"\bwhat is ai\b",
        r"\bexplain blockchain\b",
    ]),
    ("goodbye", [
        r"\bbye\b", r"\bgoodbye\b", r"\bsee you\b", r"\bhave a nice day\b",
    ]),
    ("rude", [
        r"\bstupid\b", r"\bdumb\b", r"\buseless\b", r"\bshut up\b",
    ]),
]
_QUERY_PATTERNS = [
    (category, [re.compile(p) for p in patterns])
    for category, patterns in _QUERY_PATTERNS
]


def categorize_query(query):
    """Classify a user query into a conversational category.

    Returns one of ``"greeting"``, ``"thank_you"``, ``"small_talk"``,
    ``"unrelated"``, ``"goodbye"``, ``"rude"``, ``"vague"`` (empty
    input), or ``"normal"`` for a genuine document question.  Matching
    is case-insensitive (query is lowercased first).
    """
    query = query.lower().strip()

    # Fix: query_pdf has a "vague" branch that was unreachable because
    # this function never returned that category.
    if not query:
        return "vague"

    for category, patterns in _QUERY_PATTERNS:
        if any(p.search(query) for p in patterns):
            return category

    return "normal"
165
+
166
def search_text(query, top_k=2):
    """Return up to *top_k* metadata records most similar to *query*.

    Encodes the query with the shared model and performs an L2 search in
    the global FAISS index; FAISS pads missing results with index -1,
    which are filtered out.
    """
    encoded = model.encode(query, convert_to_numpy=True)
    query_embedding = np.array(encoded).astype("float32").reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)
    return [metadata[idx] for idx in indices[0] if idx >= 0]
177
+
178
+ # Serve HTML Page
179
+ @app.route("/")
180
+ def home():
181
+ return render_template("index.html")
182
+
183
# Canned replies for conversational (non-document) query categories;
# replaces the original run of seven copy-pasted if-blocks.
_CANNED_RESPONSES = {
    "greeting": "Hello! How can I assist you with Exelsys EasyHR?",
    "thank_you": "You're welcome! How can I assist you further?",
    "small_talk": "I'm here to assist with Exelsys EasyHR. How can I help?",
    "unrelated": "I'm here to assist with Exelsys easyHR queries only.",
    "vague": "Could you please provide more details?",
    "goodbye": "You're welcome! Have a great day!",
    "rude": "I'm here to assist you professionally.",
}


@app.route("/query", methods=["POST"])
def query_pdf():
    """Answer a user query against the indexed PDF.

    Expects JSON ``{"query": "..."}`` and returns
    ``{"text": str, "images": [path, ...]}``.  Conversational queries
    get a canned reply; everything else is answered by the LLM from the
    top FAISS matches, attaching the matched pages' images unless the
    model reported no relevant content.
    """
    query = request.json.get("query")

    # Fix: .get() returns None for a missing key, and the original then
    # crashed in categorize_query on None.lower().
    if not query:
        return jsonify({"text": "Could you please provide more details?", "images": []})

    query_type = categorize_query(query)
    canned = _CANNED_RESPONSES.get(query_type)
    if canned is not None:
        return jsonify({"text": canned, "images": []})

    # Search for relevant PDF content using FAISS.
    results = search_text(query, top_k=3)
    if not results:
        return jsonify({"text": "No relevant results found in the PDF.", "images": []})

    # Merge multiple text results into one context string.
    retrieved_text = "\n\n---\n\n".join(res["text"] for res in results)

    prompt_extract = PromptTemplate.from_template(
        """
### YOU ARE AN EXELSYS EASYHR GUIDE ASSISTANT:
### INSTRUCTIONS:
- Your job is to provide step-by-step guidance for the following user query.
- Base your response **only** on the retrieved context from the PDF.
- If no relevant information is found, simply respond with: "Not found."
- If the user greets you (e.g., "Hello", "Hi", "Good morning"), respond politely but keep it brief.
- If the query is unrelated to Exelsys easyHR, respond with: "I'm here to assist with Exelsys easyHR queries only."

### USER QUERY:
{query}

### CONTEXT FROM PDF:
{retrieved_text}

### ANSWER:
"""
    )

    # Chain the prompt into ChatGroq and run it.
    chain_extract = prompt_extract | llm
    chat_response = chain_extract.invoke(
        {"query": query, "retrieved_text": retrieved_text}
    )
    response_text = str(chat_response.content)

    # Attach page images only when the model actually answered from the
    # PDF (i.e. did not reply "Not found." or the off-topic message).
    retrieved_images = []
    if "Not found." not in response_text and "I'm here to assist" not in response_text:
        retrieved_images = [
            img for res in results if "images" in res for img in res["images"]
        ]

    return jsonify({"text": response_text, "images": retrieved_images})
261
+
262
# Start the Flask development server on all interfaces, port 7860
# (the Hugging Face Spaces convention).
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
264
+
265
+