DrishtiSharma commited on
Commit
6d584d6
·
verified ·
1 Parent(s): 08e5756

Delete lab/title_issue_attempt1.py

Browse files
Files changed (1) hide show
  1. lab/title_issue_attempt1.py +0 -252
lab/title_issue_attempt1.py DELETED
@@ -1,252 +0,0 @@
1
- import streamlit as st
2
- import os
3
- import json
4
- import requests
5
- import pdfplumber
6
- import chromadb
7
- import re
8
- from langchain.document_loaders import PDFPlumberLoader
9
- from langchain_huggingface import HuggingFaceEmbeddings
10
- from langchain_experimental.text_splitter import SemanticChunker
11
- from langchain_chroma import Chroma
12
- from langchain.chains import LLMChain
13
- from langchain.prompts import PromptTemplate
14
- from langchain_groq import ChatGroq
15
- from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth
16
-
17
# ----------------- Streamlit UI Setup -----------------
st.set_page_config(page_title="Blah-1", layout="centered")

# ----------------- API Keys -----------------
# Copy the Groq key from Streamlit secrets into the process environment
# (an empty string is written when the secret is absent).
os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")

# ----------------- LLM Models -----------------
# llm_judge drives the relevancy/selection chains; rag_llm writes the final answer.
llm_judge = ChatGroq(model="deepseek-r1-distill-llama-70b")
rag_llm = ChatGroq(model="mixtral-8x7b-32768")
for _llm in (llm_judge, rag_llm):
    _llm.verbose = True

# Clear ChromaDB cache to fix tenant issue
chromadb.api.client.SharedSystemClient.clear_system_cache()

# ----------------- ChromaDB Persistent Directory -----------------
CHROMA_DB_DIR = "/mnt/data/chroma_db"
os.makedirs(CHROMA_DB_DIR, exist_ok=True)

# ----------------- Initialize Session State -----------------
# Seed each pipeline flag/slot once; values already present (from an earlier
# rerun of the script) are left untouched.
_SESSION_DEFAULTS = {
    "pdf_loaded": False,
    "chunked": False,
    "vector_created": False,
    "processed_chunks": None,
    "vector_store": None,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
49
-
50
-
51
- # ----------------- Text Cleaning Functions -----------------
52
def clean_extracted_text(text):
    """
    Normalize whitespace in text extracted from a PDF while keeping line structure.

    Steps, in order:
      1. Convert CRLF / CR line endings to "\\n".
      2. Collapse each run of line breaks (plus any whitespace around it)
         into a single "\\n".
      3. Collapse runs of two or more horizontal whitespace characters
         into one space.
      4. Re-join words that were hyphenated across a line break.

    Line breaks are deliberately preserved: callers split the result on
    "\\n" to locate title/author/affiliation lines, so whitespace collapsing
    must never swallow a newline.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # [^\S\n] is "any whitespace except newline"; trailing \s* also absorbs
    # blank lines and the next line's leading indentation.
    text = re.sub(r"[^\S\n]*\n\s*", "\n", text)
    text = re.sub(r"[^\S\n]{2,}", " ", text)  # Remove extra spaces within a line
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)  # Fix hyphenated words split by a newline
    return text.strip()
60
-
61
def extract_title_manually(text):
    """
    Attempts to find the title by checking the first few lines.

    A line qualifies as the title when it:
    - has more than 5 whitespace-separated words, and
    - does not contain a common header word ("abstract", "introduction",
      "keywords", "contents", "table", "figure", optionally pluralised)
      as a whole word, case-insensitively.

    Header words are matched on word boundaries so that ordinary words which
    merely contain one (e.g. "Stable", "Configure") do not disqualify a title.

    Args:
        text: Text whose lines are separated by "\\n".

    Returns:
        The first qualifying line among the first 5 lines (stripped),
        or "Unknown" when none qualifies.
    """
    ignore_keywords = ("abstract", "introduction", "keywords", "contents", "table", "figure")
    header_word = re.compile(r"\b(?:" + "|".join(ignore_keywords) + r")s?\b", re.IGNORECASE)

    for line in text.split("\n")[:5]:  # Check only the first 5 lines
        candidate = line.strip()
        if len(candidate.split()) > 5 and not header_word.search(candidate):
            return candidate  # Return first valid title
    return "Unknown"
75
-
76
- # ----------------- Metadata Extraction -----------------
77
- # ----------------- Metadata Extraction -----------------
78
def extract_metadata(pdf_path):
    """Extracts metadata using simple heuristics without LLM.

    Only the first page of the PDF is inspected. Its text is cleaned with
    ``clean_extracted_text`` and then scanned line by line:

    - Title: result of ``extract_title_manually``.
    - Author: first line (other than the title line) containing a
      comma-separated list of name-like tokens.
    - Emails: every email-shaped token on the page, joined by ", ".
    - Affiliations: the line immediately after the first line containing "@".

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A dict with the string-valued keys "Title", "Author", "Emails" and
        "Affiliations". Fields that cannot be determined hold "Unknown",
        "No emails found" or "No affiliations found" respectively.
    """
    with pdfplumber.open(pdf_path) as pdf:
        if not pdf.pages:
            return {
                "Title": "Unknown",
                "Author": "Unknown",
                "Emails": "No emails found",
                "Affiliations": "No affiliations found"
            }

        # Extract text from the first page
        first_page_text = pdf.pages[0].extract_text() or "No text found."

    cleaned_text = clean_extracted_text(first_page_text)
    lines = cleaned_text.split("\n")  # split once; reused by every heuristic below

    # Extract Title
    pre_extracted_title = extract_title_manually(cleaned_text)

    # Extract Authors (Names typically appear before affiliations)
    author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")
    authors = "Unknown"
    for line in lines:
        # A title containing commas also satisfies the author pattern;
        # skip it so the title is not reported as the author list.
        if line.strip() == pre_extracted_title:
            continue
        match = author_pattern.search(line)
        if match:
            authors = match.group(0)
            break

    # Extract Emails
    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"

    # Extract Affiliations: take the line right after the first email line.
    # Uses the same "not found" sentinel as the empty-PDF branch above.
    affiliations = "No affiliations found"
    for index, line in enumerate(lines):
        if "@" in line:
            if index + 1 < len(lines):
                affiliations = lines[index + 1]
            break

    return {
        "Title": pre_extracted_title,
        "Author": authors,
        "Emails": emails,
        "Affiliations": affiliations
    }
123
-
124
-
125
# ----------------- Step 1: Choose PDF Source -----------------
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)


def _register_new_pdf(signature):
    """Record which PDF is now on disk and invalidate all derived state."""
    st.session_state.pdf_signature = signature
    st.session_state.pdf_loaded = False
    st.session_state.chunked = False
    st.session_state.vector_created = False


# Streamlit re-executes this script on every interaction. The processing flags
# are therefore reset only when the selected PDF actually changes; otherwise
# every rerun (e.g. typing a query) would re-chunk and re-index the same file.
if pdf_source == "Upload a PDF file":
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
    if uploaded_file:
        upload_signature = ("upload", uploaded_file.name, uploaded_file.size)
        if st.session_state.get("pdf_signature") != upload_signature:
            st.session_state.pdf_path = "/mnt/data/temp.pdf"
            with open(st.session_state.pdf_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            _register_new_pdf(upload_signature)

elif pdf_source == "Enter a PDF URL":
    pdf_url = st.text_input("Enter PDF URL:")
    url_signature = ("url", pdf_url)
    # Download when a URL is given that differs from the PDF currently loaded
    # (a failed download leaves the signature unchanged, so it is retried).
    if pdf_url and st.session_state.get("pdf_signature") != url_signature:
        with st.spinner("🔄 Downloading PDF..."):
            try:
                # Bounded wait so an unresponsive host cannot hang the app.
                response = requests.get(pdf_url, timeout=30)
                if response.status_code == 200:
                    st.session_state.pdf_path = "/mnt/data/temp.pdf"
                    with open(st.session_state.pdf_path, "wb") as f:
                        f.write(response.content)
                    _register_new_pdf(url_signature)
                    st.success("✅ PDF Downloaded Successfully!")
                else:
                    st.error("❌ Failed to download PDF. Check the URL.")
            except (requests.RequestException, OSError) as e:
                st.error(f"Error downloading PDF: {e}")
156
-
157
-
158
# ----------------- Process PDF -----------------
# Local import: needed to wrap the extracted metadata as a real document chunk.
from langchain.schema import Document

# Runs once per newly selected PDF: load pages, show heuristic metadata,
# build the embedding model and split the document into semantic chunks.
if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
    with st.spinner("🔄 Processing document... Please wait."):
        loader = PDFPlumberLoader(st.session_state.pdf_path)
        docs = loader.load()
        if not docs:
            # Nothing to index; halt this run instead of failing on docs[0].
            st.error("No pages could be read from the PDF.")
            st.stop()
        st.json(docs[0].metadata)

        # Extract metadata
        metadata = extract_metadata(st.session_state.pdf_path)

        # Display extracted-metadata
        if isinstance(metadata, dict):
            st.subheader("📄 Extracted Document Metadata")
            st.write(f"**Title:** {metadata.get('Title', 'Unknown')}")
            st.write(f"**Author:** {metadata.get('Author', 'Unknown')}")
            st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
            st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
            metadata_text = "\n".join(f"{field}: {value}" for field, value in metadata.items())
        else:
            st.error("Metadata extraction failed.")
            metadata_text = str(metadata)

        # Embedding Model (module-level name: also used when the vector store is built)
        model_name = "nomic-ai/modernbert-embed-base"
        embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})

        # Convert metadata into a retrievable chunk. It must be a Document with
        # string page_content: it is stored alongside the splitter's Document
        # chunks and handed to the vector store's add_documents().
        metadata_doc = Document(page_content=metadata_text, metadata={"source": "metadata"})

        # Prevent unnecessary re-chunking
        if not st.session_state.chunked:
            text_splitter = SemanticChunker(embedding_model)
            document_chunks = text_splitter.split_documents(docs)
            document_chunks.insert(0, metadata_doc)  # Insert metadata as a retrievable document
            st.session_state.processed_chunks = document_chunks
            st.session_state.chunked = True

    st.session_state.pdf_loaded = True
    st.success("✅ Document processed and chunked successfully!")
196
-
197
# ----------------- Setup Vector Store -----------------
# Build the persistent Chroma collection once chunks exist and have not been
# indexed yet; the store is kept in session state for later queries.
_chunks_ready = st.session_state.processed_chunks
if not st.session_state.vector_created and _chunks_ready:
    with st.spinner("🔄 Initializing Vector Store..."):
        _chroma_options = {
            "persist_directory": CHROMA_DB_DIR,  # <-- Ensures persistence
            "collection_name": "deepseek_collection",
            "collection_metadata": {"hnsw:space": "cosine"},
            "embedding_function": embedding_model,
        }
        st.session_state.vector_store = Chroma(**_chroma_options)
        st.session_state.vector_store.add_documents(st.session_state.processed_chunks)
        st.session_state.vector_created = True
        st.success("✅ Vector store initialized successfully!")
209
-
210
-
211
# ----------------- Query Input -----------------
query = st.text_input("🔍 Ask a question about the document:")

if query:
    if st.session_state.vector_store is None:
        # No document has been indexed yet, so there is nothing to retrieve from.
        st.warning("Please provide a PDF and wait for it to be processed before asking a question.")
    else:
        with st.spinner("🔄 Retrieving relevant context..."):
            retriever = st.session_state.vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
            retrieved_docs = retriever.invoke(query)
            context = [d.page_content for d in retrieved_docs]
            st.success("✅ Context retrieved successfully!")

        # ----------------- Run Individual Chains Explicitly -----------------
        # Four stages, each feeding the next:
        #   1. judge relevancy of the retrieved contexts to the query
        #   2. pick the relevant context number(s) from that judgement
        #   3. extract the picked contexts
        #   4. answer the query from the extracted contexts
        context_relevancy_chain = LLMChain(llm=llm_judge, prompt=PromptTemplate(input_variables=["retriever_query", "context"], template=relevancy_prompt), output_key="relevancy_response")
        relevant_context_chain = LLMChain(llm=llm_judge, prompt=PromptTemplate(input_variables=["relevancy_response"], template=relevant_context_picker_prompt), output_key="context_number")
        relevant_contexts_chain = LLMChain(llm=llm_judge, prompt=PromptTemplate(input_variables=["context_number", "context"], template=response_synth), output_key="relevant_contexts")
        response_chain = LLMChain(llm=rag_llm, prompt=PromptTemplate(input_variables=["query", "context"], template=rag_prompt), output_key="final_response")

        response_crisis = context_relevancy_chain.invoke({"context": context, "retriever_query": query})
        relevant_response = relevant_context_chain.invoke({"relevancy_response": response_crisis["relevancy_response"]})
        contexts = relevant_contexts_chain.invoke({"context_number": relevant_response["context_number"], "context": context})
        final_response = response_chain.invoke({"query": query, "context": contexts["relevant_contexts"]})

        # ----------------- Display All Outputs -----------------
        st.markdown("### Context Relevancy Evaluation")
        st.json(response_crisis["relevancy_response"])

        st.markdown("### Picked Relevant Contexts")
        st.json(relevant_response["context_number"])

        st.markdown("### Extracted Relevant Contexts")
        st.json(contexts["relevant_contexts"])

        # Each intermediate value lives in the result of the chain that
        # produced it; final_response only carries the response chain's own
        # inputs plus "final_response", so it cannot be indexed for these.
        st.subheader("context_relevancy_evaluation_chain Statement")
        st.json(response_crisis["relevancy_response"])

        st.subheader("pick_relevant_context_chain Statement")
        st.json(relevant_response["context_number"])

        st.subheader("relevant_contexts_chain Statement")
        st.json(contexts["relevant_contexts"])

        st.subheader("RAG Response Statement")
        st.json(final_response["final_response"])