Spaces:
Sleeping
Sleeping
from sentence_transformers import SentenceTransformer | |
from scipy.spatial.distance import cosine | |
import numpy as np | |
from data_ret import search_relevant_data # Assuming this function fetches the data from some source | |
import streamlit as st | |
# Load the Sentence Transformer model used for similarity search
# Models already constructed in this process, keyed by model name, so that
# answering several questions does not pay the model-loading cost each time.
_SIMILARITY_MODEL_CACHE = {}


def load_similarity_model(model_name="all-mpnet-base-v2"):
    """Return the SentenceTransformer used for similarity search.

    The model is constructed on the first request for a given name and then
    reused from a module-level cache on later calls.

    Args:
        model_name: Identifier passed to ``SentenceTransformer``. Defaults to
            "all-mpnet-base-v2", the model this module previously hard-coded.

    Returns:
        The ``SentenceTransformer`` instance for ``model_name``.
    """
    cached_model = _SIMILARITY_MODEL_CACHE.get(model_name)
    if cached_model is not None:
        return cached_model

    st.write("Loading similarity model...")  # Show status on Streamlit
    retriever_model = SentenceTransformer(model_name)
    st.write("Similarity model loaded.")

    _SIMILARITY_MODEL_CACHE[model_name] = retriever_model
    return retriever_model
# Create embeddings for the retrieved documents
def create_embeddings(documents, model):
    """Encode every document into an embedding, showing progress in Streamlit.

    The text embedded for a document is its 'question', 'answer' and 'text'
    fields joined by single spaces. A field that is missing or ``None`` is
    treated as an empty string, matching how the context is later extracted
    from the same documents.

    Args:
        documents: Sequence of dict-like records (anything offering ``.get``).
        model: Object exposing ``encode(text)`` that returns one embedding.

    Returns:
        A numpy array holding one embedding per document, in input order, or
        an empty array when ``documents`` is empty.
    """
    if not documents:
        st.write("No documents provided for embedding.")
        return np.array([])  # Return empty array if no documents

    total = len(documents)
    st.write(f"Creating embeddings for {total} documents...")  # Show progress

    # Track progress of the embedding creation using Streamlit's progress bar
    progress_bar = st.progress(0)

    embeddings = []
    # Counting from 1 lets the bar reach exactly 1.0 after the last document;
    # position / total never exceeds 1.0, so the value stays in range.
    for position, doc in enumerate(documents, start=1):
        doc_text = " ".join(
            doc.get(field) or "" for field in ("question", "answer", "text")
        )
        embeddings.append(model.encode(doc_text))
        progress_bar.progress(position / total)

    embeddings = np.array(embeddings)
    st.write(f"Embeddings created with shape: {embeddings.shape}")
    return embeddings
# Rank documents against the question embedding and return the indices of the top matches
def retrieve_documents(question_embedding, document_embeddings, top_k=5):
    """Return the indices of the documents most similar to the question.

    Similarity is cosine similarity between ``question_embedding`` and each
    row of ``document_embeddings``.

    Args:
        question_embedding: 1-D embedding of the question.
        document_embeddings: numpy array with one document embedding per row.
        top_k: Maximum number of indices to return.

    Returns:
        A numpy array of at most ``top_k`` row indices ordered from most to
        least similar, or an empty list when there are no embeddings or
        ``top_k`` is not positive.
    """
    if document_embeddings.size == 0:
        st.write("No document embeddings available for retrieval.")
        return []

    # A slice like [-0:] would select *every* document, so a non-positive
    # top_k is answered explicitly with "nothing".
    if top_k <= 0:
        return []

    st.write("Calculating similarities between question and documents...")

    question_vec = np.asarray(question_embedding, dtype=float).ravel()
    doc_matrix = np.atleast_2d(np.asarray(document_embeddings, dtype=float))

    # Cosine similarity of every row against the question in one pass.
    norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(question_vec)
    with np.errstate(divide="ignore", invalid="ignore"):
        similarities = (doc_matrix @ question_vec) / norms

    # Zero-norm embeddings produce NaN/inf. argsort places NaN last, which
    # would rank those documents as the best matches, so push them to the
    # bottom instead.
    similarities = np.where(np.isfinite(similarities), similarities, -np.inf)

    # Get indices of top K similarities (highest similarity first)
    top_indices = similarities.argsort()[::-1][:top_k]
    return top_indices
# Main function: build the answering context from the documents most relevant to the topic and question
def get_relevant_context(question, topic):
    """Build a context string from the documents most relevant to a question.

    Documents are fetched for ``topic`` with ``search_relevant_data``,
    embedded, ranked against the embedding of ``question``, and the 'answer'
    and 'text' fields of the top-ranked documents are joined with spaces.

    Args:
        question: The user's question; passed to the model's ``encode``.
        topic: Search query handed to ``search_relevant_data``.

    Returns:
        The combined context string. When nothing usable is found, a short
        explanatory message is returned instead; when any step raises, the
        error is shown in Streamlit and ``"Error: <message>"`` is returned.
    """
    try:
        st.write("Searching for relevant documents based on the topic...")
        # Normalise an empty/None search result to [] before calling len(),
        # so "nothing found" is reported as such rather than as a TypeError.
        relevant_documents = search_relevant_data(topic) or []
        st.write(f"Found {len(relevant_documents)} relevant documents.")
        if not relevant_documents:
            return "No relevant documents found."

        retriever_model = load_similarity_model()  # Load the similarity model

        # Create document embeddings and show progress
        document_embeddings = create_embeddings(relevant_documents, retriever_model)
        if document_embeddings.size == 0:
            return "No embeddings created for relevant documents."

        st.write("Generating question embedding and retrieving relevant documents...")
        question_embedding = retriever_model.encode(question)
        relevant_doc_indices = retrieve_documents(question_embedding, document_embeddings)
        if len(relevant_doc_indices) == 0:
            return "No relevant documents found after embedding."

        # Extract context from the top relevant documents
        contexts = []
        for idx in relevant_doc_indices:
            doc = relevant_documents[idx]
            # `or ''` also covers fields that are present but set to None.
            context = (doc.get('answer') or '') + " " + (doc.get('text') or '')
            if context.strip():
                contexts.append(context)

        if not contexts:
            return "No valid contexts available for answering."

        # Return the combined context for question answering
        return " ".join(contexts)
    except Exception as e:
        # Top-level boundary for the UI: surface the failure to the user and
        # hand the caller a string instead of propagating the exception.
        st.write(f"Error processing question: {e}")
        return f"Error: {e}"