karthikvarunn commited on
Commit
dae202a
Β·
verified Β·
1 Parent(s): 30d3882

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -183
app.py CHANGED
@@ -3,197 +3,152 @@ from dotenv import load_dotenv
3
  from langchain_community.document_loaders import PyPDFLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.schema import HumanMessage
6
- from langchain_openai import OpenAIEmbeddings
7
  from langchain_voyageai import VoyageAIEmbeddings
8
  from langchain_pinecone import PineconeVectorStore
9
- from langchain_openai import ChatOpenAI
10
  from langchain.prompts import PromptTemplate
11
- from langchain_core.output_parsers import StrOutputParser
12
- from typing import List, Tuple
13
- from langchain.schema import BaseRetriever
14
- from langchain_core.documents import Document
15
- from langchain_core.runnables import chain
16
- from pinecone import Pinecone, ServerlessSpec
17
  import openai
18
- import numpy as np
19
  import gradio as gr
20
 
 
21
  load_dotenv()
22
-
23
- # Initialize OpenAI and Pinecone credentials
24
  openai.api_key = os.environ.get("OPENAI_API_KEY")
25
  pinecone_api_key = os.environ.get("PINECONE_API_KEY")
26
- pinecone_environment = os.environ.get("PINECONE_ENV")
27
  voyage_api_key = os.environ.get("VOYAGE_API_KEY")
28
 
29
  # Initialize Pinecone
30
- try:
31
- pc = Pinecone(api_key=pinecone_api_key)
32
- except Exception as e:
33
- print(f"Error connecting to Pinecone: {str(e)}")
34
-
35
- embeddings = VoyageAIEmbeddings(
36
- voyage_api_key=voyage_api_key, model="voyage-law-2"
37
- )
38
 
 
39
  def expand_query(query):
40
- """
41
- Expands the query to make it more precise using an LLM.
42
- Example: "docs" -> "Find all legal documents related to case law."
43
- """
44
  llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.3)
45
- prompt = f"Rewrite the following vague search query into a more specific one:\nQuery: {query}\nSpecific Query:"
46
-
47
  refined_query = llm([HumanMessage(content=prompt)]).content.strip()
48
-
49
  return refined_query if refined_query else query
50
-
51
- def search_documents(query, user_groups, index_name="briefmeta", min_score=0.01):
52
- try:
53
- vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
54
-
55
- results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=30)
56
-
57
- seen_ids = set()
58
- unique_results = []
59
-
60
- for result in results:
61
- unique_id = result.metadata.get("id")
62
- doc_groups = result.metadata.get("groups", [])
63
- score = result.metadata.get("score", 0)
64
-
65
- # Apply user group filtering & score threshold
66
- if unique_id not in seen_ids and any(group in user_groups for group in doc_groups) and score > min_score:
67
- seen_ids.add(unique_id)
68
- unique_results.append(result)
69
-
70
- context = [
71
- {
72
- "doc_id": result.metadata.get("doc_id", "N/A"),
73
- "chunk_id": result.metadata.get("id", "N/A"),
74
- "title": result.metadata.get("source", "N/A"),
75
- "text": result.page_content,
76
- "page_number": str(result.metadata.get("page_number", "N/A")),
77
- "score": str(result.metadata.get("score", "N/A")),
78
- }
79
- for result in unique_results
80
- ]
81
 
82
- return context
83
- except Exception as e:
84
- return [], f"Error searching documents: {str(e)}"
85
-
86
- def rerank(query, context):
87
- result = pc.inference.rerank(
88
- model="bge-reranker-v2-m3",
89
- query=query,
90
- documents=context,
91
- top_n=5,
92
- return_documents=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  )
94
- return result
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def generate_output(context, query):
97
- try:
98
- llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.5)
99
-
100
- if not context.strip():
101
- return "I couldn't find relevant information for your query. Could you refine your question?"
102
-
103
- prompt_template = PromptTemplate(
104
- template="""Use the following document context to answer accurately:
105
- Context: {context}
106
- Question: {question}
107
- If the answer is unclear, ask for clarification.
108
- Answer:""",
109
- input_variables=["context", "question"]
110
- )
111
-
112
- prompt = prompt_template.format(context=context, question=query)
113
- response = llm([HumanMessage(content=prompt)]).content.strip()
114
-
115
- return response if response else "No relevant answer found."
116
-
117
- except Exception as e:
118
- return f"Error generating output: {str(e)}"
119
 
120
- def generate_search_summary(search_results, document_titles, query):
121
- """
122
- Generates an intelligent search summary based on retrieved documents.
123
- """
124
- try:
125
- if not search_results:
126
- return "No relevant documents were found for your search. Try refining your query."
127
-
128
- # Extract metadata
129
- num_results = len(document_titles)
130
- doc_titles = [doc.get("title", "Unknown Document") for doc in search_results]
131
- doc_pages = [doc.get("page_number", "N/A") for doc in search_results]
132
- relevance_scores = [float(doc.get("score", 0)) for doc in search_results]
133
-
134
- # Identify recency (to be implemented)
135
- recency_info = ""
136
- if "date_uploaded" in search_results[0]: # Assuming date is available
137
- dates = [doc.get("date_uploaded", "Unknown") for doc in search_results]
138
- recency_info = f"Most recent document uploaded on {max(dates)}."
139
-
140
- # Identify common keywords
141
- common_terms = set()
142
- for doc in search_results:
143
- text_snippet = doc.get("text", "").split()[:50] # Take first 50 words
144
- common_terms.update(text_snippet)
145
-
146
- summary_prompt = f"""
147
- Generate a concise 1-3 sentence summary of the search results.
148
- - User Query: "{query}"
149
- - Matching Documents: {num_results} found
150
- - Titles: {", ".join(set(doc_titles))}
151
- - Pages Referenced: {", ".join(set(doc_pages))}
152
- - Common Terms: {", ".join(list(common_terms)[:10])} (top terms)
153
- - Recency: {recency_info}
154
- - Relevance Scores (0-1): {relevance_scores}
155
-
156
- Provide a clear, user-friendly summary with an action suggestion.
157
- """
158
-
159
- llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.5)
160
- summary = llm([HumanMessage(content=summary_prompt)]).content.strip()
161
-
162
- return summary if summary else "No intelligent summary available."
163
-
164
- except Exception as e:
165
- return f"Error generating search summary: {str(e)}"
166
 
 
167
  def complete_workflow(query, user_groups, index_name="briefmeta"):
168
  try:
169
- # Expand the query
170
  refined_query = expand_query(query)
171
-
172
- # Proceed with refined query instead of the original
173
- context_data = search_documents(refined_query, user_groups)
174
-
175
- reranked = rerank(refined_query, context_data)
176
-
177
- context_data = []
178
- for i, entry in enumerate(reranked.data):
179
- context_data.append({
180
- 'chunk_id': entry['document']['chunk_id'],
181
- 'doc_id': entry['document']['doc_id'],
182
- 'title': entry['document']['title'],
183
- 'text': entry['document']['text'],
184
- 'page_number': str(entry['document']['page_number']),
185
- 'score': str(entry['score'])
186
- })
187
 
188
  document_titles = list({os.path.basename(doc["title"]) for doc in context_data})
189
  formatted_titles = " " + "\n".join(document_titles)
190
-
191
- total_results = len(context_data)
192
 
193
  results = {
194
  "results": [
195
  {
196
- "natural_language_output": generate_output(doc["text"], refined_query), # Use refined query
197
  "chunk_id": doc["chunk_id"],
198
  "document_id": doc["doc_id"],
199
  "title": doc["title"],
@@ -203,36 +158,30 @@ def complete_workflow(query, user_groups, index_name="briefmeta"):
203
  }
204
  for doc in context_data
205
  ],
206
- "total_results": total_results
 
207
  }
208
 
209
- return results, formatted_titles
 
210
  except Exception as e:
211
- return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
212
 
 
213
  def gradio_app():
214
- with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
215
- gr.Markdown("### Intelligent Document Search Prototype-v0.2")
216
-
217
- with gr.Row():
218
- user_query = gr.Textbox(label=" Enter Search Query")
219
- user_groups = gr.Textbox(label=" User Groups", placeholder="e.g., ['KarthikPersonal']", interactive=True)
220
- index_name = gr.Textbox(label=" Index Name", placeholder="Default: briefmeta", interactive=True)
221
- search_btn = gr.Button(" Search")
222
-
223
- with gr.Row():
224
- result_output = gr.JSON(label=" Search Results", elem_id="result-output")
225
- with gr.Row():
226
- titles_output = gr.Textbox(label=" Retrieved Document Titles", interactive=False)
227
-
228
- search_btn.click(
229
- complete_workflow,
230
- inputs=[user_query, user_groups, index_name],
231
- outputs=[result_output, titles_output]
232
- )
233
 
234
  return app
235
 
236
-
237
- # Launch the app
238
- gradio_app().launch()
 
3
  from langchain_community.document_loaders import PyPDFLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.schema import HumanMessage
6
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
7
  from langchain_voyageai import VoyageAIEmbeddings
8
  from langchain_pinecone import PineconeVectorStore
 
9
  from langchain.prompts import PromptTemplate
10
+ from pinecone import Pinecone
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.metrics.pairwise import cosine_similarity
 
 
 
13
  import openai
 
14
  import gradio as gr
15
 
16
+ # Load API keys
17
  load_dotenv()
 
 
18
  openai.api_key = os.environ.get("OPENAI_API_KEY")
19
  pinecone_api_key = os.environ.get("PINECONE_API_KEY")
 
20
  voyage_api_key = os.environ.get("VOYAGE_API_KEY")
21
 
22
  # Initialize Pinecone
23
+ pc = Pinecone(api_key=pinecone_api_key)
24
+ embeddings = VoyageAIEmbeddings(voyage_api_key=voyage_api_key, model="voyage-law-2")
 
 
 
 
 
 
25
 
26
+ # πŸ”Ή Query Expansion using GPT-4
27
  def expand_query(query):
 
 
 
 
28
  llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.3)
29
+ prompt = f"Rewrite this vague query into a more specific one:\nQuery: {query}\nSpecific Query:"
 
30
  refined_query = llm([HumanMessage(content=prompt)]).content.strip()
 
31
  return refined_query if refined_query else query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ # πŸ”Ή Hybrid Search (TF-IDF + Semantic Retrieval)
34
+ def hybrid_search(query, user_groups, index_name="briefmeta", min_score=0.01, fetch_k=50):
35
+ vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
36
+ semantic_results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=fetch_k)
37
+
38
+ all_texts = [doc.page_content for doc in semantic_results]
39
+ vectorizer = TfidfVectorizer(stop_words="english")
40
+ tfidf_matrix = vectorizer.fit_transform(all_texts)
41
+ query_tfidf = vectorizer.transform([query])
42
+ keyword_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
43
+
44
+ combined_results, seen_ids = [], set()
45
+ for i, doc in enumerate(semantic_results):
46
+ doc_id, doc_groups = doc.metadata.get("id"), doc.metadata.get("groups", [])
47
+ semantic_score = float(doc.metadata.get("score", 0))
48
+ keyword_score = float(keyword_scores[i])
49
+ final_score = 0.7 * semantic_score + 0.3 * keyword_score # Hybrid score
50
+
51
+ if doc_id not in seen_ids and any(group in user_groups for group in doc_groups) and final_score > min_score:
52
+ seen_ids.add(doc_id)
53
+ doc.metadata["final_score"] = final_score
54
+ combined_results.append(doc)
55
+
56
+ combined_results.sort(key=lambda x: x.metadata["final_score"], reverse=True)
57
+ return [
58
+ {
59
+ "doc_id": doc.metadata.get("doc_id", "N/A"),
60
+ "chunk_id": doc.metadata.get("id", "N/A"),
61
+ "title": doc.metadata.get("source", "N/A"),
62
+ "text": doc.page_content,
63
+ "page_number": str(doc.metadata.get("page_number", "N/A")),
64
+ "score": str(doc.metadata.get("final_score", "N/A")),
65
+ }
66
+ for doc in combined_results
67
+ ]
68
+
69
+ # πŸ”Ή Metadata-Weighted Reranking
70
+ def rerank(query, context):
71
+ reranker = pc.inference.rerank(
72
+ model="bge-reranker-v2-m3", query=query, documents=context, top_n=10, return_documents=True
73
  )
 
74
 
75
+ final_reranked = []
76
+ for entry in reranker.data:
77
+ doc, score = entry["document"], float(entry["score"])
78
+ citation_boost = 1.2 if "high_citations" in doc.get("tags", []) else 1.0
79
+ recency_boost = 1.1 if "recent_upload" in doc.get("tags", []) else 1.0
80
+ final_score = score * citation_boost * recency_boost
81
+ doc["final_score"] = final_score
82
+ final_reranked.append(doc)
83
+
84
+ final_reranked.sort(key=lambda x: x["final_score"], reverse=True)
85
+ return final_reranked
86
+
87
+ # πŸ”Ή Intelligent Search Summary Generator
88
+ def generate_search_summary(search_results, query):
89
+ if not search_results:
90
+ return "No relevant documents found. Try refining your query."
91
+
92
+ num_results = len(search_results)
93
+ doc_titles = [doc.get("title", "Unknown Document") for doc in search_results]
94
+ doc_pages = [doc.get("page_number", "N/A") for doc in search_results]
95
+ relevance_scores = [float(doc.get("score", 0)) for doc in search_results]
96
+
97
+ summary_prompt = f"""
98
+ Generate a concise 1-3 sentence summary:
99
+ - User Query: "{query}"
100
+ - Matching Documents: {num_results} found
101
+ - Titles: {", ".join(set(doc_titles))}
102
+ - Pages Referenced: {", ".join(set(doc_pages))}
103
+ - Relevance Scores (0-1): {relevance_scores}
104
+ Provide a clear, user-friendly summary with an action suggestion.
105
+ """
106
+
107
+ llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.5)
108
+ summary = llm([HumanMessage(content=summary_prompt)]).content.strip()
109
+ return summary if summary else "No intelligent summary available."
110
+
111
+ # πŸ”Ή LLM-based Answer Generation
112
  def generate_output(context, query):
113
+ if not context.strip():
114
+ return "No relevant information found. Try refining your query."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.5)
117
+ prompt_template = PromptTemplate(
118
+ template="Use the following context to answer the question:\nContext: {context}\nQuestion: {question}\nAnswer:",
119
+ input_variables=["context", "question"],
120
+ )
121
+ prompt = prompt_template.format(context=context, question=query)
122
+ response = llm([HumanMessage(content=prompt)]).content.strip()
123
+ return response if response else "No relevant answer found."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
+ # πŸ”Ή Full Workflow
126
  def complete_workflow(query, user_groups, index_name="briefmeta"):
127
  try:
 
128
  refined_query = expand_query(query)
129
+ context_data = hybrid_search(refined_query, user_groups)
130
+ reranked_results = rerank(refined_query, context_data)
131
+
132
+ context_data = [
133
+ {
134
+ 'chunk_id': doc["chunk_id"],
135
+ 'doc_id': doc["doc_id"],
136
+ 'title': doc["title"],
137
+ 'text': doc["text"],
138
+ 'page_number': str(doc["page_number"]),
139
+ 'score': str(doc["final_score"])
140
+ }
141
+ for doc in reranked_results
142
+ ]
 
 
143
 
144
  document_titles = list({os.path.basename(doc["title"]) for doc in context_data})
145
  formatted_titles = " " + "\n".join(document_titles)
146
+ intelligent_search_summary = generate_search_summary(context_data, refined_query)
 
147
 
148
  results = {
149
  "results": [
150
  {
151
+ "natural_language_output": generate_output(doc["text"], refined_query),
152
  "chunk_id": doc["chunk_id"],
153
  "document_id": doc["doc_id"],
154
  "title": doc["title"],
 
158
  }
159
  for doc in context_data
160
  ],
161
+ "total_results": len(context_data),
162
+ "intelligent_search_summary": intelligent_search_summary
163
  }
164
 
165
+ return results, formatted_titles, intelligent_search_summary
166
+
167
  except Exception as e:
168
+ return {"results": [], "total_results": 0, "intelligent_search_summary": "Error generating summary."}, f"Error in workflow: {str(e)}"
169
 
170
+ # πŸ”Ή Gradio UI
171
  def gradio_app():
172
+ with gr.Blocks() as app:
173
+ gr.Markdown("### πŸ“„ Intelligent Document Search Prototype-v0.2")
174
+ user_query = gr.Textbox(label="πŸ” Enter Search Query")
175
+ user_groups = gr.Textbox(label="πŸ‘₯ User Groups", placeholder="e.g., ['KarthikPersonal']")
176
+ index_name = gr.Textbox(label="πŸ“‚ Index Name", placeholder="Default: briefmeta")
177
+ search_btn = gr.Button("πŸ”Ž Search")
178
+ search_summary = gr.Textbox(label="πŸ“œ Intelligent Search Summary", interactive=False)
179
+ result_output = gr.JSON(label="πŸ“Š Search Results")
180
+ titles_output = gr.Textbox(label="πŸ“‚ Retrieved Document Titles", interactive=False)
181
+
182
+ search_btn.click(complete_workflow, inputs=[user_query, user_groups, index_name], outputs=[result_output, titles_output, search_summary])
 
 
 
 
 
 
 
 
183
 
184
  return app
185
 
186
+ # Launch the App
187
+ gradio_app().launch()