karthikvarunn committed on
Commit
5a7855f
·
verified ·
1 Parent(s): 687663e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -45
app.py CHANGED
@@ -66,53 +66,63 @@ def expand_query(query):
66
  # for doc in combined_results
67
  # ]
68
 
69
- def hybrid_search(query, user_groups, index_name="briefmeta", min_score=0, fetch_k=50):
70
  vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
71
 
72
- # **Filter chunks by user groups before retrieval**
73
- filtered_results = vector_store.similarity_search(
74
- query="", # Empty query just to fetch all documents
75
- k=fetch_k,
76
- filter={"groups": {"$in": user_groups}}, # Filter for user-specific chunks
77
- )
78
-
79
- # **Perform Semantic Search on Filtered Results**
80
- semantic_results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=fetch_k)
81
-
82
- # **TF-IDF Keyword Search**
83
- all_texts = [doc.page_content for doc in semantic_results]
84
- vectorizer = TfidfVectorizer(stop_words="english")
85
- tfidf_matrix = vectorizer.fit_transform(all_texts)
86
- query_tfidf = vectorizer.transform([query])
87
- keyword_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
88
-
89
- # **Hybrid Score Calculation**
90
- combined_results, seen_ids = [], set()
91
- for i, doc in enumerate(semantic_results):
92
- doc_id = doc.metadata.get("id")
93
- semantic_score = float(doc.metadata.get("score", 0))
94
- keyword_score = float(keyword_scores[i])
95
- final_score = 0.65 * semantic_score + 0.35 * keyword_score # Hybrid score
96
-
97
- if doc_id not in seen_ids and final_score > min_score:
98
- seen_ids.add(doc_id)
99
- doc.metadata["final_score"] = final_score
100
- combined_results.append(doc)
101
-
102
- # **Sort Results by Final Score**
103
- combined_results.sort(key=lambda x: x.metadata["final_score"], reverse=True)
104
-
105
- return [
106
- {
107
- "doc_id": doc.metadata.get("doc_id", "N/A"),
108
- "chunk_id": doc.metadata.get("id", "N/A"),
109
- "title": doc.metadata.get("source", "N/A"),
110
- "text": doc.page_content,
111
- "page_number": str(doc.metadata.get("page_number", "N/A")),
112
- "score": str(doc.metadata.get("final_score", "N/A")),
113
- }
114
- for doc in combined_results
115
- ]
 
 
 
 
 
 
 
 
 
 
116
 
117
  # 🔹 Metadata-Weighted Reranking
118
  def rerank(query, context):
 
66
  # for doc in combined_results
67
  # ]
68
 
69
def hybrid_search(query, user_groups, index_name="briefmeta", min_score=0.01, fetch_k=50):
    """Retrieve chunks visible to ``user_groups`` and rank them by a hybrid score.

    The vector store is queried once with the real ``query`` and a metadata
    filter on ``groups``, so only chunks the caller may see are ever fetched.
    Each hit is then re-scored as ``0.65 * semantic + 0.35 * keyword``, where
    ``semantic`` is the score returned by the vector store and ``keyword`` is
    the TF-IDF cosine similarity between ``query`` and the chunk text.

    NOTE(review): the weighting assumes the index metric is "higher is more
    similar" (e.g. cosine) -- confirm against the index configuration.

    Args:
        query: Free-text search query.
        user_groups: Group identifiers the caller belongs to; matched against
            the ``groups`` metadata field with ``$in``.
        index_name: Name of the Pinecone index to search.
        min_score: Hits whose hybrid score is not strictly greater than this
            value are dropped.
        fetch_k: Maximum number of candidate chunks requested from the store.

    Returns:
        A list of dicts with the keys ``doc_id``, ``chunk_id``, ``title``,
        ``text``, ``page_number`` and ``score`` (the last two as strings),
        sorted by descending hybrid score. An empty list is returned when
        nothing matches or when retrieval/scoring fails.
    """
    # A caller with no groups can never satisfy the ``$in`` filter, so skip
    # the vector-store round trip entirely.
    if not user_groups:
        return []

    vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)

    try:
        # Filter and semantic search in a single call. The ``*_with_score``
        # variant is required because plain ``similarity_search`` does not
        # expose the similarity score on the returned documents.
        scored_docs = vector_store.similarity_search_with_score(
            query,
            k=fetch_k,
            filter={"groups": {"$in": user_groups}},
        )

        if not scored_docs:
            print("No results:")
            return []

        texts = [doc.page_content for doc, _ in scored_docs]
        try:
            vectorizer = TfidfVectorizer(stop_words="english")
            tfidf_matrix = vectorizer.fit_transform(texts)
            keyword_scores = cosine_similarity(
                vectorizer.transform([query]), tfidf_matrix
            ).flatten()
        except ValueError:
            # Raised when every chunk is empty or contains only stop words
            # (empty vocabulary). Fall back to semantic-only ranking instead
            # of discarding all results.
            keyword_scores = [0.0] * len(texts)

        ranked, seen_ids = [], set()
        for (doc, semantic_score), keyword_score in zip(scored_docs, keyword_scores):
            final_score = 0.65 * float(semantic_score) + 0.35 * float(keyword_score)
            if final_score <= min_score:
                continue

            # De-duplicate only on real ids: chunks without an "id" would all
            # share the key ``None`` and collapse into a single result.
            chunk_id = doc.metadata.get("id")
            if chunk_id is not None:
                if chunk_id in seen_ids:
                    continue
                seen_ids.add(chunk_id)

            ranked.append((final_score, doc))

        ranked.sort(key=lambda pair: pair[0], reverse=True)

        return [
            {
                "doc_id": doc.metadata.get("doc_id", "N/A"),
                "chunk_id": doc.metadata.get("id", "N/A"),
                "title": doc.metadata.get("source", "N/A"),
                "text": doc.page_content,
                "page_number": str(doc.metadata.get("page_number", "N/A")),
                "score": str(final_score),
            }
            for final_score, doc in ranked
        ]
    except Exception as e:
        # Best-effort boundary: report the failure but keep the return type
        # consistent (a list) so callers can always iterate the result.
        print(f"hybrid_search failed: {e}")
        return []
126
 
127
  # 🔹 Metadata-Weighted Reranking
128
  def rerank(query, context):