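# NOTE: the capture of this file began with hosting-page residue
# ("Spaces:" / "Sleeping" / "Sleeping"); kept here as a comment so the module parses.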
import json
import logging
import os
import re

import faiss
import fitz  # PyMuPDF for text extraction
import numpy as np
from dotenv import load_dotenv
from flask import Flask, render_template, request, jsonify
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
# Load variables from a .env file into the process environment; GROQ_API_KEY
# is read from the environment further down when the LLM client is created.
load_dotenv()
def extract_text_images(pdf_path, output_dir="static/output_images", json_path="pdf_data.json"):
    """Extract per-page text and embedded images from a PDF.

    Every embedded image is written to ``output_dir`` as
    ``page_<page>_img_<n>.<ext>`` and a JSON list with one
    ``{"page", "text", "images"}`` record per page is written to ``json_path``.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory that receives the extracted image files; it is
            created when missing.
        json_path: Destination of the per-page JSON summary.

    Returns:
        The status string ``"Extraction completed!"``.
    """
    os.makedirs(output_dir, exist_ok=True)

    data = []
    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text")
            image_paths = []
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first item of each image entry is its xref
                base_image = doc.extract_image(xref)
                if not base_image:
                    # Nothing extractable for this xref; skip it rather than crash.
                    continue
                image_ext = base_image["ext"]
                # Keep "/" as the separator: these paths are stored in the JSON
                # exactly as built here and returned to clients later.
                image_filename = f"{output_dir}/page_{page_num}_img_{img_index}.{image_ext}"
                with open(image_filename, "wb") as img_file:
                    img_file.write(base_image["image"])
                image_paths.append(image_filename)
            data.append({"page": page_num, "text": text, "images": image_paths})

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    return "Extraction completed!"
# NOTE: extraction runs at import time, i.e. on every start of the app.
pdf_path = "./Exelsys easyHR v10 User Guide.pdf"
extract_text_images(pdf_path)

# Load Hugging Face model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def get_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return the result.

    The value is whatever ``model.encode(text, convert_to_numpy=True)`` yields.
    """
    return model.encode(text, convert_to_numpy=True)
def store_embeddings(data_path="pdf_data.json", index_path="faiss_index.bin",
                     metadata_path="metadata.json"):
    """Embed every page record and persist a FAISS index plus its metadata.

    Reads the page records from ``data_path``, embeds each record's ``"text"``,
    stores the vectors in an ``IndexFlatL2`` written to ``index_path`` and
    writes the matching ``{"page", "text", "images"}`` records, in the same
    order as the vectors, to ``metadata_path``.

    Args:
        data_path: JSON file holding the page records to embed.
        index_path: Destination file for the FAISS index.
        metadata_path: Destination file for the metadata list.

    Returns:
        The status string ``"Embeddings stored successfully!"``.
    """
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    # Ask the model for its output size instead of hard-coding 384, so the
    # index stays correct if the embedding model is ever swapped.
    dimension = model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(dimension)

    texts = [entry["text"] for entry in data]
    if texts:
        # Encode all pages in one batch and add them with a single call.
        embeddings = np.ascontiguousarray(get_embedding(texts), dtype="float32")
        index.add(embeddings)

    metadata = [
        {"page": entry["page"], "text": entry["text"], "images": entry["images"]}
        for entry in data
    ]

    faiss.write_index(index, index_path)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)
    return "Embeddings stored successfully!"
# Build the FAISS index and metadata files from the extracted page data.
# NOTE: this also runs at import time, i.e. on every start of the app.
store_embeddings()

app = Flask(__name__)

# Load FAISS Index. The SentenceTransformer `model` created earlier in this
# module is reused here instead of loading the same weights a second time.
index = faiss.read_index("faiss_index.bin")

groq_api_key = os.getenv('GROQ_API_KEY')
model_name = "llama-3.3-70b-versatile"
llm = ChatGroq(
    temperature=0,
    groq_api_key=groq_api_key,
    model_name=model_name
)

# Records are positionally aligned with the vectors stored in the index.
with open("metadata.json", encoding="utf-8") as f:
    metadata = json.load(f)
# Ordered (category, patterns) table: categories are tried top to bottom and
# the first one with a matching pattern wins.
# NOTE(review): every small-talk phrase except "how's your day" also appears
# under "greeting", which is checked first, so those phrases resolve to
# "greeting". Patterns match anywhere in the query, so a real question that
# merely contains e.g. "hi" or "thanks" is categorized by that word.
_QUERY_CATEGORY_PATTERNS = tuple(
    (category, tuple(re.compile(pattern) for pattern in patterns))
    for category, patterns in (
        # Greetings
        ("greeting", (
            r"\bhello\b", r"\bhi\b", r"\bhey\b", r"\bhola\b", r"\bgreetings\b",
            r"\bwhat('s| is) up\b", r"\bhowdy\b", r"\bhiya\b", r"\byo\b",
            r"\bgood (morning|afternoon|evening|day|night)\b",
            r"\bhow (are|r) you\b", r"\bhow's it going\b", r"\bhow have you been\b",
            r"\bhope you are (doing )?(well|good|fine)\b", r"\bnice to meet you\b",
            r"\bpleased to meet you\b",
        )),
        # Thank-you messages
        ("thank_you", (
            r"\bthank(s| you)\b", r"\bthanks a lot\b", r"\bthanks so much\b",
            r"\bthank you very much\b", r"\bappreciate it\b", r"\bmuch obliged\b",
            r"\bgrateful\b", r"\bcheers\b",
        )),
        # Small talk
        ("small_talk", (
            r"\bhow (are|r) you\b", r"\bhow's your day\b", r"\bwhat's up\b",
            r"\bhow's it going\b", r"\bhow have you been\b", r"\bhope you are well\b",
        )),
        # Unrelated topics
        ("unrelated", (
            r"\btell me a joke\b", r"\bwho won\b", r"\bwhat is ai\b", r"\bexplain blockchain\b",
        )),
        # Goodbye messages
        ("goodbye", (
            r"\bbye\b", r"\bgoodbye\b", r"\bsee you\b", r"\bhave a nice day\b",
        )),
        # Rude or inappropriate messages
        ("rude", (
            r"\bstupid\b", r"\bdumb\b", r"\buseless\b", r"\bshut up\b",
        )),
    )
)


def categorize_query(query):
    """
    Categorizes user queries into different types (greetings, small talk, unrelated, etc.).

    Returns one of "greeting", "thank_you", "small_talk", "unrelated",
    "goodbye", "rude", "vague" or "normal". "vague" is returned when there is
    nothing to classify (``None``, a non-string value, or blank text);
    "normal" is returned when no pattern matches.
    """
    if not isinstance(query, str):
        return "vague"
    query = query.lower().strip()
    if not query:
        return "vague"

    for category, patterns in _QUERY_CATEGORY_PATTERNS:
        if any(pattern.search(query) for pattern in patterns):
            return category
    return "normal"
# Function to Search for Relevant Answers
def search_text(query, top_k=2):
    """Return the metadata records nearest to ``query`` in the FAISS index.

    Args:
        query: Text to embed with the module-level ``model``.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of entries from ``metadata``, in the order the index returned
        them. Result slots that do not map to a stored record are dropped.
    """
    if top_k < 1:
        return []

    query_embedding = np.ascontiguousarray(
        model.encode(query, convert_to_numpy=True), dtype="float32"
    ).reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)

    # Negative ids mark unfilled result slots; the upper bound protects against
    # an index file and metadata file that have drifted out of sync.
    return [metadata[int(idx)] for idx in indices[0] if 0 <= idx < len(metadata)]
# Serve HTML Page
# Registered explicitly: without a route decorator Flask never calls this view.
@app.route("/")
def home():
    """Render the ``index.html`` template."""
    return render_template("index.html")
# Canned replies for query categories that never reach the retrieval pipeline.
_CANNED_RESPONSES = {
    "greeting": "Hello! How can I assist you with Exelsys EasyHR?",
    "thank_you": "You're welcome! How can I assist you further?",
    "small_talk": "I'm here to assist with Exelsys EasyHR. How can I help?",
    "unrelated": "I'm here to assist with Exelsys easyHR queries only.",
    "vague": "Could you please provide more details?",
    "goodbye": "You're welcome! Have a great day!",
    "rude": "I'm here to assist you professionally.",
}

# Built once at import time instead of on every request.
_PROMPT_EXTRACT = PromptTemplate.from_template(
    """
### YOU ARE AN EXELSYS EASYHR GUIDE ASSISTANT:
### INSTRUCTIONS:
- Your job is to provide step-by-step guidance for the following user query.
- Base your response **only** on the retrieved context from the PDF.
- If no relevant information is found, simply respond with: "Not found."
- If the user greets you (e.g., "Hello", "Hi", "Good morning"), respond politely but keep it brief.
- If the query is unrelated to Exelsys easyHR, respond with: "I'm here to assist with Exelsys easyHR queries only."
### USER QUERY:
{query}
### CONTEXT FROM PDF:
{retrieved_text}
### ANSWER:
"""
)


def _text_only_response(text):
    """Return a JSON reply carrying ``text`` and an empty image list."""
    return jsonify({"text": text, "images": []})


# NOTE(review): the view had no route decorator, so Flask never dispatched to
# it. The "/query" path is an assumption - confirm it against the URL that
# index.html posts to.
@app.route("/query", methods=["POST"])
def query_pdf():
    """Answer a user query from the indexed PDF content.

    Expects a JSON body with a ``"query"`` string. Greetings, thanks and the
    other non-question categories get a canned reply; everything else is
    answered by the LLM from the nearest PDF pages.

    Returns:
        A JSON object ``{"text": ..., "images": [...]}``. Images of the
        retrieved pages are included only when the LLM answer is neither
        "Not found." nor the off-topic reply. If the LLM call fails, the same
        JSON shape is returned with HTTP status 502.
    """
    log = logging.getLogger(__name__)

    # Tolerate a missing/invalid JSON body or a missing "query" field instead
    # of failing with an unhandled exception.
    payload = request.get_json(silent=True)
    query = payload.get("query") if isinstance(payload, dict) else None
    if not isinstance(query, str) or not query.strip():
        return _text_only_response(_CANNED_RESPONSES["vague"])

    canned = _CANNED_RESPONSES.get(categorize_query(query))
    if canned is not None:
        return _text_only_response(canned)

    # Search for relevant PDF content using FAISS
    results = search_text(query, top_k=3)
    if not results:
        return _text_only_response("No relevant results found in the PDF.")

    # Merge multiple text results
    retrieved_text = "\n\n---\n\n".join(res["text"] for res in results)
    log.debug("Retrieved context:\n%s", retrieved_text)

    # Chain the prompt with ChatGroq
    chain_extract = _PROMPT_EXTRACT | llm
    try:
        chat_response = chain_extract.invoke({"query": query, "retrieved_text": retrieved_text})
    except Exception:
        # Request boundary: log the failure and keep the reply in the JSON
        # shape the client expects rather than an HTML error page.
        log.exception("LLM request failed")
        return _text_only_response(
            "Sorry, I could not generate an answer right now. Please try again later."
        ), 502

    # Convert response to string
    response_text = str(chat_response.content)

    # Determine if images should be included
    retrieved_images = []
    if "Not found." not in response_text and "I'm here to assist" not in response_text:
        retrieved_images = [img for res in results for img in res.get("images", [])]

    # Final response JSON
    return jsonify({"text": response_text, "images": retrieved_images})
if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 7860.
    app.run(host="0.0.0.0", port=7860)