karthikvarunn committed on
Commit
8a0b3b7
·
verified ·
1 Parent(s): c1dfb09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -5
app.py CHANGED
@@ -31,29 +31,77 @@ def expand_query(query):
31
  return refined_query if refined_query else query
32
 
33
  # 🔹 Hybrid Search (TF-IDF + Semantic Retrieval)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def hybrid_search(query, user_groups, index_name="briefmeta", min_score=0.01, fetch_k=50):
35
  vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
 
 
 
 
 
 
 
 
 
36
  semantic_results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=fetch_k)
37
 
 
38
  all_texts = [doc.page_content for doc in semantic_results]
39
  vectorizer = TfidfVectorizer(stop_words="english")
40
  tfidf_matrix = vectorizer.fit_transform(all_texts)
41
  query_tfidf = vectorizer.transform([query])
42
  keyword_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
43
 
 
44
  combined_results, seen_ids = [], set()
45
  for i, doc in enumerate(semantic_results):
46
- doc_id, doc_groups = doc.metadata.get("id"), doc.metadata.get("groups", [])
47
  semantic_score = float(doc.metadata.get("score", 0))
48
  keyword_score = float(keyword_scores[i])
49
  final_score = 0.65 * semantic_score + 0.35 * keyword_score # Hybrid score
50
 
51
- if doc_id not in seen_ids and any(group in user_groups for group in doc_groups) and final_score > min_score:
52
  seen_ids.add(doc_id)
53
  doc.metadata["final_score"] = final_score
54
  combined_results.append(doc)
55
 
 
56
  combined_results.sort(key=lambda x: x.metadata["final_score"], reverse=True)
 
57
  return [
58
  {
59
  "doc_id": doc.metadata.get("doc_id", "N/A"),
@@ -66,6 +114,7 @@ def hybrid_search(query, user_groups, index_name="briefmeta", min_score=0.01, fe
66
  for doc in combined_results
67
  ]
68
 
 
69
  # 🔹 Metadata-Weighted Reranking
70
  def rerank(query, context):
71
  reranker = pc.inference.rerank(
@@ -75,9 +124,9 @@ def rerank(query, context):
75
  final_reranked = []
76
  for entry in reranker.data:
77
  doc, score = entry["document"], float(entry["score"])
78
- citation_boost = 1.2 if "high_citations" in doc.get("tags", []) else 1.0
79
- recency_boost = 1.1 if "recent_upload" in doc.get("tags", []) else 1.0
80
- final_score = score * citation_boost * recency_boost
81
  doc["final_score"] = final_score
82
  final_reranked.append(doc)
83
 
 
31
  return refined_query if refined_query else query
32
 
33
  # 🔹 Hybrid Search (TF-IDF + Semantic Retrieval)
34
+ # def hybrid_search(query, user_groups, index_name="briefmeta", min_score=0.01, fetch_k=50):
35
+ # vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
36
+ # semantic_results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=fetch_k)
37
+
38
+ # all_texts = [doc.page_content for doc in semantic_results]
39
+ # vectorizer = TfidfVectorizer(stop_words="english")
40
+ # tfidf_matrix = vectorizer.fit_transform(all_texts)
41
+ # query_tfidf = vectorizer.transform([query])
42
+ # keyword_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
43
+
44
+ # combined_results, seen_ids = [], set()
45
+ # for i, doc in enumerate(semantic_results):
46
+ # doc_id, doc_groups = doc.metadata.get("id"), doc.metadata.get("groups", [])
47
+ # semantic_score = float(doc.metadata.get("score", 0))
48
+ # keyword_score = float(keyword_scores[i])
49
+ # final_score = 0.65 * semantic_score + 0.35 * keyword_score # Hybrid score
50
+
51
+ # if doc_id not in seen_ids and any(group in user_groups for group in doc_groups) and final_score > min_score:
52
+ # seen_ids.add(doc_id)
53
+ # doc.metadata["final_score"] = final_score
54
+ # combined_results.append(doc)
55
+
56
+ # combined_results.sort(key=lambda x: x.metadata["final_score"], reverse=True)
57
+ # return [
58
+ # {
59
+ # "doc_id": doc.metadata.get("doc_id", "N/A"),
60
+ # "chunk_id": doc.metadata.get("id", "N/A"),
61
+ # "title": doc.metadata.get("source", "N/A"),
62
+ # "text": doc.page_content,
63
+ # "page_number": str(doc.metadata.get("page_number", "N/A")),
64
+ # "score": str(doc.metadata.get("final_score", "N/A")),
65
+ # }
66
+ # for doc in combined_results
67
+ ]
68
+
69
  def hybrid_search(query, user_groups, index_name="briefmeta", min_score=0.01, fetch_k=50):
70
  vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
71
+
72
+ # **Filter chunks by user groups before retrieval**
73
+ filtered_results = vector_store.similarity_search(
74
+ query="", # Empty query just to fetch all documents
75
+ k=fetch_k,
76
+ filter={"groups": {"$in": user_groups}}, # Filter for user-specific chunks
77
+ )
78
+
79
+ # **Perform Semantic Search on Filtered Results**
80
  semantic_results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=fetch_k)
81
 
82
+ # **TF-IDF Keyword Search**
83
  all_texts = [doc.page_content for doc in semantic_results]
84
  vectorizer = TfidfVectorizer(stop_words="english")
85
  tfidf_matrix = vectorizer.fit_transform(all_texts)
86
  query_tfidf = vectorizer.transform([query])
87
  keyword_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
88
 
89
+ # **Hybrid Score Calculation**
90
  combined_results, seen_ids = [], set()
91
  for i, doc in enumerate(semantic_results):
92
+ doc_id = doc.metadata.get("id")
93
  semantic_score = float(doc.metadata.get("score", 0))
94
  keyword_score = float(keyword_scores[i])
95
  final_score = 0.65 * semantic_score + 0.35 * keyword_score # Hybrid score
96
 
97
+ if doc_id not in seen_ids and final_score > min_score:
98
  seen_ids.add(doc_id)
99
  doc.metadata["final_score"] = final_score
100
  combined_results.append(doc)
101
 
102
+ # **Sort Results by Final Score**
103
  combined_results.sort(key=lambda x: x.metadata["final_score"], reverse=True)
104
+
105
  return [
106
  {
107
  "doc_id": doc.metadata.get("doc_id", "N/A"),
 
114
  for doc in combined_results
115
  ]
116
 
117
+
118
  # 🔹 Metadata-Weighted Reranking
119
  def rerank(query, context):
120
  reranker = pc.inference.rerank(
 
124
  final_reranked = []
125
  for entry in reranker.data:
126
  doc, score = entry["document"], float(entry["score"])
127
+ # citation_boost = 1.2 if "high_citations" in doc.get("tags", []) else 1.0
128
+ # recency_boost = 1.1 if "recent_upload" in doc.get("tags", []) else 1.0
129
+ # final_score = score * citation_boost * recency_boost
130
  doc["final_score"] = final_score
131
  final_reranked.append(doc)
132