Spaces:

shrut123
/

PubMedSearch

Sleeping

App Files Files Community

shrut123 committed on Sep 21, 2024

Commit

9a6f924

verified ·

1 Parent(s): 125fa26

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -5

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import streamlit as st
 from pinecone import Pinecone
 from sentence_transformers import SentenceTransformer
 # Title of the Streamlit App
 st.title("Medical Hybrid Search")
@@ -13,7 +14,6 @@ index = None
 def initialize_pinecone():
     api_key = os.getenv('PINECONE_API_KEY')  # Get Pinecone API key from environment variable
     if api_key:
-        # Initialize Pinecone client using the new class instance method
         pc = Pinecone(api_key=api_key)
         return pc
     else:
@@ -23,7 +23,6 @@ def initialize_pinecone():
 # Function to connect to the 'pubmed-splade' index
 def connect_to_index(pc):
     index_name = 'pubmed-splade'  # Hardcoded index name
-    # Connect to the 'pubmed-splade' index
     if index_name in pc.list_indexes().names():
         index = pc.Index(index_name)
         return index
@@ -35,6 +34,17 @@ def connect_to_index(pc):
 def encode_query(model, query_text):
     """Encode the query text with the given model and return the result as a list."""
     embedding = model.encode(query_text)
     return embedding.tolist()
 # Initialize Pinecone
 pc = initialize_pinecone()
@@ -49,14 +59,30 @@ if pc:
     # Query input
     query_text = st.text_input("Enter a Query to Search", "Can clinicians use the PHQ-9 to assess depression?")
     # Button to encode query and search the Pinecone index
     if st.button("Search Query"):
         if query_text and index:
             dense_vector = encode_query(model, query_text)
             # Search the index
             results = index.query(
-                vector=dense_vector,
                 top_k=3,
                 include_metadata=True
             )
@@ -64,7 +90,7 @@ if pc:
             st.write("### Search Results:")
             for match in results.matches:
                 st.markdown(f"#### Score: **{match.score:.4f}**")
-                st.write(f" #### Result: ** {match.metadata.get('context', 'No context available.')} **")
                 st.write("---")
         else:
             st.error("Please enter a query and ensure the index is initialized.")

 import streamlit as st
 from pinecone import Pinecone
 from sentence_transformers import SentenceTransformer
+import torch
 # Title of the Streamlit App
 st.title("Medical Hybrid Search")
 def initialize_pinecone():
     api_key = os.getenv('PINECONE_API_KEY')  # Get Pinecone API key from environment variable
     if api_key:
         pc = Pinecone(api_key=api_key)
         return pc
     else:
 # Function to connect to the 'pubmed-splade' index
 def connect_to_index(pc):
     index_name = 'pubmed-splade'  # Hardcoded index name
     if index_name in pc.list_indexes().names():
         index = pc.Index(index_name)
         return index
 def encode_query(model, query_text):
     """Encode the query text with the given model and return the result as a list."""
     embedding = model.encode(query_text)
     return embedding.tolist()
# Function to create hybrid scaled vectors
def hybrid_scale(dense, sparse, alpha):
    """Weight a dense/sparse vector pair by a convex combination factor.

    Args:
        dense: Iterable of numeric dense-vector components.
        sparse: Mapping with an 'indices' entry and a 'values' entry
            (an iterable of numbers).
        alpha: Weight in [0, 1]. Dense components are multiplied by
            ``alpha`` and sparse values by ``1 - alpha``.

    Returns:
        A ``(hdense, hsparse)`` tuple: the scaled dense components as a list,
        and a new dict holding the original 'indices' object together with
        the scaled 'values' list.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1], including NaN.
    """
    # Chained comparison on purpose: NaN fails every comparison, so it is
    # rejected here instead of slipping past two separate `<` / `>` checks.
    if not 0 <= alpha <= 1:
        raise ValueError("Alpha must be between 0 and 1")
    sparse_weight = 1 - alpha
    hsparse = {
        'indices': sparse['indices'],
        'values': [v * sparse_weight for v in sparse['values']],
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse
 # Initialize Pinecone
 pc = initialize_pinecone()
     # Query input
     query_text = st.text_input("Enter a Query to Search", "Can clinicians use the PHQ-9 to assess depression?")
+    # Alpha input
+    alpha = st.slider("Set Alpha (for dense and sparse vector balancing)", 0.0, 1.0, 0.5)
     # Button to encode query and search the Pinecone index
     if st.button("Search Query"):
         if query_text and index:
+            # Encode query to get dense and sparse vectors
             dense_vector = encode_query(model, query_text)
+            input_ids = model.tokenizer(query_text, return_tensors='pt')
+            with torch.no_grad():
+                sparse_vector = sparse_model(d_kwargs=input_ids.to(device))['d_rep'].squeeze()
+            # Prepare sparse vector format for Pinecone
+            indices = sparse_vector.nonzero().squeeze().cpu().tolist()
+            values = sparse_vector[indices].cpu().tolist()
+            sparse_dict = {"indices": indices, "values": values}
+            # Scale dense and sparse vectors
+            hdense, hsparse = hybrid_scale(dense_vector, sparse_dict, alpha)
             # Search the index
             results = index.query(
+                vector=hdense,
+                sparse_vector=hsparse,
                 top_k=3,
                 include_metadata=True
             )
             st.write("### Search Results:")
             for match in results.matches:
                 st.markdown(f"#### Score: **{match.score:.4f}**")
+                st.write(f" #### Context: {match.metadata.get('context', 'No context available.')}")
                 st.write("---")
         else:
             st.error("Please enter a query and ensure the index is initialized.")