DrishtiSharma committed
Commit 412e4a3 · verified · 1 Parent(s): 006ee88

Update app.py

Files changed (1): app.py (+14 -26)
app.py CHANGED

@@ -3,6 +3,7 @@ import os
 import requests
 import pdfplumber
 import chromadb
+import re
 from langchain.document_loaders import PDFPlumberLoader
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_experimental.text_splitter import SemanticChunker
@@ -10,7 +11,6 @@ from langchain_chroma import Chroma
 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain_groq import ChatGroq
-import re
 from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth
 
 # ----------------- Streamlit UI Setup -----------------
@@ -21,7 +21,7 @@ st.title("Blah-1")
 os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
 
 # ----------------- ChromaDB Persistent Directory -----------------
-CHROMA_DB_DIR = "/mnt/data/chroma_db"
+CHROMA_DB_DIR = "/mnt/data/chroma_db"  # Ensure persistence
 os.makedirs(CHROMA_DB_DIR, exist_ok=True)
 
 # ----------------- Initialize Session State -----------------
@@ -36,9 +36,9 @@ if "processed_chunks" not in st.session_state:
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 
-# ----------------- Extract Metadata (Title, Author, Emails, Affiliations) -----------------
+# ----------------- Improved Metadata Extraction -----------------
 def extract_metadata(pdf_path):
-    """Extract metadata such as Title, Author, Emails, and Affiliations."""
+    """Extracts title, author, emails, and affiliations from PDF."""
     with pdfplumber.open(pdf_path) as pdf:
         metadata = pdf.metadata or {}
 
@@ -46,13 +46,14 @@ def extract_metadata(pdf_path):
         title = metadata.get("Title", "").strip()
         if not title and pdf.pages:
             text = pdf.pages[0].extract_text()
-            title = text.split("\n")[0] if text else "Untitled Document"
+            title_match = re.search(r"(?i)title[:\-]?\s*(.*)", text or "")
+            title = title_match.group(1) if title_match else text.split("\n")[0] if text else "Untitled Document"
 
         # Extract author
         author = metadata.get("Author", "").strip()
         if not author and pdf.pages:
-            author_matches = re.findall(r"By ([A-Za-z\s,]+)", pdf.pages[0].extract_text() or "")
-            author = author_matches[0] if author_matches else "Unknown Author"
+            author_match = re.search(r"(?i)by\s+([A-Za-z\s,]+)", pdf.pages[0].extract_text() or "")
+            author = author_match.group(1).strip() if author_match else "Unknown Author"
 
         # Extract emails
         emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", pdf.pages[0].extract_text() or "")
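The title and author fallbacks above move from a case-sensitive "By " `re.findall` to case-insensitive `re.search` patterns. A quick, runnable sketch of what the new patterns capture on a hypothetical first page (real PDF text varies widely):

    import re

    # Hypothetical first-page text; the "(" stops the author character class.
    first_page = "Title: A Study of Retrieval\nby Jane Doe, John Smith\n(contact: jdoe@example.com)"

    title_match = re.search(r"(?i)title[:\-]?\s*(.*)", first_page)
    print(title_match.group(1))            # A Study of Retrieval

    author_match = re.search(r"(?i)by\s+([A-Za-z\s,]+)", first_page)
    print(author_match.group(1).strip())   # Jane Doe, John Smith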
@@ -77,25 +78,6 @@ if pdf_source == "Upload a PDF file":
         st.session_state.chunked = False
         st.session_state.vector_created = False
 
-elif pdf_source == "Enter a PDF URL":
-    pdf_url = st.text_input("Enter PDF URL:")
-    if pdf_url and not st.session_state.pdf_loaded:
-        with st.spinner("🔄 Downloading PDF..."):
-            try:
-                response = requests.get(pdf_url)
-                if response.status_code == 200:
-                    st.session_state.pdf_path = "/mnt/data/temp.pdf"
-                    with open(st.session_state.pdf_path, "wb") as f:
-                        f.write(response.content)
-                    st.session_state.pdf_loaded = False
-                    st.session_state.chunked = False
-                    st.session_state.vector_created = False
-                    st.success("✅ PDF Downloaded Successfully!")
-                else:
-                    st.error("❌ Failed to download PDF. Check the URL.")
-            except Exception as e:
-                st.error(f"Error downloading PDF: {e}")
-
 # ----------------- Process PDF -----------------
 if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
     with st.spinner("🔄 Processing document... Please wait."):
@@ -117,10 +99,15 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
         model_name = "nomic-ai/modernbert-embed-base"
         embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
 
+        # Convert metadata into a retrievable chunk
+        metadata_text = f"Title: {title}\nAuthor: {author}\nEmails: {email_str}\nAffiliations: {affiliation_str}"
+        metadata_doc = {"page_content": metadata_text, "metadata": {"source": "metadata"}}
+
         # Prevent unnecessary re-chunking
         if not st.session_state.chunked:
             text_splitter = SemanticChunker(embedding_model)
             document_chunks = text_splitter.split_documents(docs)
+            document_chunks.insert(0, metadata_doc)  # Insert metadata as a retrievable document
            st.session_state.processed_chunks = document_chunks
             st.session_state.chunked = True
 
@@ -140,6 +127,7 @@ if not st.session_state.vector_created and st.session_state.processed_chunks:
     st.session_state.vector_created = True
     st.success("✅ Vector store initialized successfully!")
 
+
 # ----------------- Query Input -----------------
 query = st.text_input("🔍 Ask a question about the document:")
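One caveat on the new metadata chunk: `SemanticChunker.split_documents` returns LangChain `Document` objects, and Chroma reads `.page_content` from every entry it indexes, so inserting a plain dict like the `metadata_doc` above is likely to fail once the chunks reach the vector store. A minimal sketch of the same idea using the `Document` class (placeholder values stand in for the variables computed in app.py):

    from langchain.schema import Document

    # Placeholders for the values extract_metadata() produces in app.py.
    title, author = "Example Paper", "Jane Doe"
    email_str, affiliation_str = "jdoe@example.com", "Example University"

    metadata_text = f"Title: {title}\nAuthor: {author}\nEmails: {email_str}\nAffiliations: {affiliation_str}"
    # A real Document, not a dict, so the store can read .page_content / .metadata:
    metadata_doc = Document(page_content=metadata_text, metadata={"source": "metadata"})

    document_chunks = []  # stands in for SemanticChunker output
    document_chunks.insert(0, metadata_doc)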
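A related note on persistence: the vector-store construction sits outside this diff, and `CHROMA_DB_DIR` only persists anything if it is actually handed to the store. A minimal sketch with `langchain_chroma` (the call in app.py is not shown here and may differ):

    from langchain_chroma import Chroma

    # Writing the index under CHROMA_DB_DIR lets a restarted app reload it
    # instead of re-embedding the whole document.
    vector_store = Chroma.from_documents(
        documents=st.session_state.processed_chunks,  # chunks built in app.py
        embedding=embedding_model,                    # embedding model from app.py
        persist_directory=CHROMA_DB_DIR,
    )
    st.session_state.vector_store = vector_store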