# QA-ContextRetriever/similarity_search.py
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np
from data_ret import search_relevant_data  # Project-local helper; assumed to return a list of dicts with 'question', 'answer', and optional 'text' keys
import streamlit as st

# Load the Sentence Transformer model for similarity search
def load_similarity_model():
    st.write("Loading similarity model...")  # Show status in the Streamlit UI
    retriever_model = SentenceTransformer("all-mpnet-base-v2")
    st.write("Similarity model loaded.")
    return retriever_model
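
# NOTE: Streamlit reruns the whole script on every interaction, so the model above is
# reloaded each time. A cached variant is sketched below; it assumes a Streamlit
# version that provides st.cache_resource (added in 1.18). The function name is
# illustrative, not part of the original module.
@st.cache_resource
def load_similarity_model_cached():
    # Constructed once per process and reused across reruns
    return SentenceTransformer("all-mpnet-base-v2")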

# Create embeddings for the retrieved documents
def create_embeddings(documents, model):
    if not documents:
        st.write("No documents provided for embedding.")
        return np.array([])  # Return an empty array if there is nothing to embed

    st.write(f"Creating embeddings for {len(documents)} documents...")
    embeddings = []

    # Track progress of the embedding creation with Streamlit's progress bar
    progress_bar = st.progress(0)

    # Build one text per document from its 'question', 'answer', and optional 'text' fields
    document_texts = [
        doc.get('question', '') + " " + doc.get('answer', '') + " " + doc.get('text', '')
        for doc in documents
    ]

    for i, doc_text in enumerate(document_texts):
        embeddings.append(model.encode(doc_text))
        # (i + 1) / n keeps the value within (0.0, 1.0] and ends at exactly 1.0
        progress_bar.progress((i + 1) / len(document_texts))

    embeddings = np.array(embeddings)
    st.write(f"Embeddings created with shape: {embeddings.shape}")
    return embeddings
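
# SentenceTransformer.encode also accepts a list of texts and batches them internally,
# which is usually much faster than encoding one document per call as above. A minimal
# sketch; the function name and batch_size value are assumptions, not original code:
def create_embeddings_batched(documents, model, batch_size=32):
    document_texts = [
        doc.get('question', '') + " " + doc.get('answer', '') + " " + doc.get('text', '')
        for doc in documents
    ]
    # encode() returns a (num_documents, embedding_dim) numpy array by default
    return model.encode(document_texts, batch_size=batch_size, show_progress_bar=False)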

# Retrieve indices of the documents most similar to the question embedding
def retrieve_documents(question_embedding, document_embeddings, top_k=5):
    if document_embeddings.size == 0:
        st.write("No document embeddings available for retrieval.")
        return []

    st.write("Calculating similarities between question and documents...")
    # Cosine similarity = 1 - cosine distance
    similarities = np.array([
        1 - cosine(question_embedding, doc_embedding)
        for doc_embedding in document_embeddings
    ])

    # Indices of the top-k similarities, highest similarity first
    top_indices = similarities.argsort()[-top_k:][::-1]
    return top_indices
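
# The SciPy loop above makes one Python-level call per document; the same top-k
# ranking can be computed with a single matrix product on L2-normalized vectors.
# A minimal sketch (the function name is illustrative):
def retrieve_documents_vectorized(question_embedding, document_embeddings, top_k=5):
    if document_embeddings.size == 0:
        return []
    # Normalize rows so the dot product equals cosine similarity
    doc_norms = document_embeddings / np.linalg.norm(document_embeddings, axis=1, keepdims=True)
    q_norm = question_embedding / np.linalg.norm(question_embedding)
    similarities = doc_norms @ q_norm  # shape: (num_documents,)
    return similarities.argsort()[-top_k:][::-1]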

# Main entry point: build the combined context string from the most relevant documents
# for a given topic and question
def get_relevant_context(question, topic):
    try:
        st.write("Searching for relevant documents based on the topic...")
        relevant_documents = search_relevant_data(topic)  # Topic-driven search query
        st.write(f"Found {len(relevant_documents)} relevant documents.")
        if not relevant_documents:
            return "No relevant documents found."

        retriever_model = load_similarity_model()

        # Create document embeddings, showing progress in the UI
        document_embeddings = create_embeddings(relevant_documents, retriever_model)
        if document_embeddings.size == 0:
            return "No embeddings created for relevant documents."

        st.write("Generating question embedding and retrieving relevant documents...")
        question_embedding = retriever_model.encode(question)
        relevant_doc_indices = retrieve_documents(question_embedding, document_embeddings)
        if len(relevant_doc_indices) == 0:
            return "No relevant documents found after embedding."

        # Collect context text from the top-ranked documents
        contexts = []
        for idx in relevant_doc_indices:
            doc = relevant_documents[idx]
            context = doc.get('answer', '') + " " + doc.get('text', '')
            if context.strip():
                contexts.append(context)

        if not contexts:
            return "No valid contexts available for answering."

        # Return the combined context for question answering
        return " ".join(contexts)
    except Exception as e:
        st.write(f"Error processing question: {str(e)}")
        return f"Error: {str(e)}"
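
# A minimal way to exercise this module as a Streamlit page (run with
# `streamlit run similarity_search.py`). The widget labels and default topic are
# illustrative assumptions, not part of the original app:
if __name__ == "__main__":
    st.title("QA Context Retriever demo")
    topic = st.text_input("Topic", value="machine learning")
    question = st.text_input("Question")
    if topic and question:
        st.write("Retrieved context:")
        st.write(get_relevant_context(question, topic))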