DrishtiSharma committed on
Commit
15ed0e7
·
verified ·
1 Parent(s): be906ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -14
app.py CHANGED
@@ -1,14 +1,16 @@
1
  import streamlit as st
2
  import os
3
  import requests
 
4
  import chromadb
5
  from langchain.document_loaders import PDFPlumberLoader
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
  from langchain_experimental.text_splitter import SemanticChunker
8
  from langchain_chroma import Chroma
9
- from langchain.chains import LLMChain, SequentialChain
10
  from langchain.prompts import PromptTemplate
11
  from langchain_groq import ChatGroq
 
12
  from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth
13
 
14
  # ----------------- Streamlit UI Setup -----------------
@@ -18,8 +20,9 @@ st.title("Blah-1")
18
  # ----------------- API Keys -----------------
19
  os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
20
 
21
- # ----------------- Clear ChromaDB Cache -----------------
22
- chromadb.api.client.SharedSystemClient.clear_system_cache()
 
23
 
24
  # ----------------- Initialize Session State -----------------
25
  if "pdf_loaded" not in st.session_state:
@@ -33,22 +36,41 @@ if "processed_chunks" not in st.session_state:
33
  if "vector_store" not in st.session_state:
34
  st.session_state.vector_store = None
35
 
36
- # ----------------- Load Models -----------------
37
- llm_judge = ChatGroq(model="deepseek-r1-distill-llama-70b")
38
- rag_llm = ChatGroq(model="mixtral-8x7b-32768")
 
 
39
 
40
- # Enable verbose logging for debugging
41
- llm_judge.verbose = True
42
- rag_llm.verbose = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  # ----------------- PDF Selection -----------------
45
- #st.subheader("PDF Selection")
46
  pdf_source = st.radio("Choose a PDF source:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
47
 
48
  if pdf_source == "Upload a PDF file":
49
  uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
50
  if uploaded_file:
51
- st.session_state.pdf_path = "temp.pdf"
52
  with open(st.session_state.pdf_path, "wb") as f:
53
  f.write(uploaded_file.getbuffer())
54
  st.session_state.pdf_loaded = False
@@ -62,7 +84,7 @@ elif pdf_source == "Enter a PDF URL":
62
  try:
63
  response = requests.get(pdf_url)
64
  if response.status_code == 200:
65
- st.session_state.pdf_path = "temp.pdf"
66
  with open(st.session_state.pdf_path, "wb") as f:
67
  f.write(response.content)
68
  st.session_state.pdf_loaded = False
@@ -79,11 +101,20 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
79
  with st.spinner("🔄 Processing document... Please wait."):
80
  loader = PDFPlumberLoader(st.session_state.pdf_path)
81
  docs = loader.load()
82
- st.json(docs[0].metadata)
 
 
 
 
 
 
 
 
 
83
 
84
  # Embedding Model
85
  model_name = "nomic-ai/modernbert-embed-base"
86
- embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs = {'normalize_embeddings': False})
87
 
88
  # Prevent unnecessary re-chunking
89
  if not st.session_state.chunked:
@@ -99,6 +130,7 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
99
  if not st.session_state.vector_created and st.session_state.processed_chunks:
100
  with st.spinner("🔄 Initializing Vector Store..."):
101
  st.session_state.vector_store = Chroma(
 
102
  collection_name="deepseek_collection",
103
  collection_metadata={"hnsw:space": "cosine"},
104
  embedding_function=embedding_model
 
1
  import streamlit as st
2
  import os
3
  import requests
4
+ import pdfplumber
5
  import chromadb
6
  from langchain.document_loaders import PDFPlumberLoader
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_experimental.text_splitter import SemanticChunker
9
  from langchain_chroma import Chroma
10
+ from langchain.chains import LLMChain
11
  from langchain.prompts import PromptTemplate
12
  from langchain_groq import ChatGroq
13
+ import re
14
  from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth
15
 
16
  # ----------------- Streamlit UI Setup -----------------
 
20
  # ----------------- API Keys -----------------
21
  os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
22
 
23
+ # ----------------- ChromaDB Persistent Directory -----------------
24
+ CHROMA_DB_DIR = "/mnt/data/chroma_db" # Hugging Face Spaces persistent storage
25
+ os.makedirs(CHROMA_DB_DIR, exist_ok=True)
26
 
27
  # ----------------- Initialize Session State -----------------
28
  if "pdf_loaded" not in st.session_state:
 
36
  if "vector_store" not in st.session_state:
37
  st.session_state.vector_store = None
38
 
39
+ # ----------------- Extract Metadata (Title, Author, Emails, Affiliations) -----------------
40
def extract_metadata(pdf_path):
    """Extract the title, author, e-mail addresses and affiliations of a PDF.

    The embedded document metadata is preferred for the title and author.
    Anything missing there is guessed heuristically from the text of the
    first page.

    Args:
        pdf_path: Path of the PDF file, opened with ``pdfplumber``.

    Returns:
        A ``(title, author, email_str, affiliation_str)`` tuple of strings.
        When nothing can be extracted for an element it falls back to
        "Untitled Document", "Unknown Author", "No emails found" or
        "No affiliations found" respectively.
    """
    with pdfplumber.open(pdf_path) as pdf:
        metadata = pdf.metadata or {}
        # Read the first page's text once instead of re-extracting it for
        # every field, and tolerate PDFs that contain no pages at all.
        first_page_text = (pdf.pages[0].extract_text() or "") if pdf.pages else ""

    def _meta_field(key):
        # A metadata key may be absent, None or hold a non-string value;
        # only a real string is usable, otherwise fall back to the heuristics.
        value = metadata.get(key)
        return value.strip() if isinstance(value, str) else ""

    # Title: metadata first, otherwise the first non-blank line of page one.
    title = _meta_field("Title")
    if not title:
        non_blank_lines = (
            line.strip() for line in first_page_text.splitlines() if line.strip()
        )
        title = next(non_blank_lines, "Untitled Document")

    # Author: metadata first, otherwise a "By <names>" byline. The match is
    # limited to a single line so it cannot swallow the text that follows.
    author = _meta_field("Author")
    if not author:
        byline = re.search(r"\bBy[ \t]+([A-Za-z][A-Za-z ,\t]*)", first_page_text)
        author = byline.group(1).strip(" ,\t") if byline else "Unknown Author"

    # E-mails: every address on the first page, duplicates removed in order.
    emails = re.findall(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text
    )
    email_str = ", ".join(dict.fromkeys(emails)) if emails else "No emails found"

    # Affiliations: an institution keyword followed by words on the same line.
    affiliations = [
        found.strip()
        for found in re.findall(
            r"(?:Department|Faculty|Institute|University|College|School)[ \t]+[\w \t]+",
            first_page_text,
        )
    ]
    affiliation_str = (
        ", ".join(dict.fromkeys(affiliations))
        if affiliations
        else "No affiliations found"
    )

    return title, author, email_str, affiliation_str
66
 
67
  # ----------------- PDF Selection -----------------
 
68
  pdf_source = st.radio("Choose a PDF source:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
69
 
70
  if pdf_source == "Upload a PDF file":
71
  uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
72
  if uploaded_file:
73
+ st.session_state.pdf_path = "/mnt/data/temp.pdf"
74
  with open(st.session_state.pdf_path, "wb") as f:
75
  f.write(uploaded_file.getbuffer())
76
  st.session_state.pdf_loaded = False
 
84
  try:
85
  response = requests.get(pdf_url)
86
  if response.status_code == 200:
87
+ st.session_state.pdf_path = "/mnt/data/temp.pdf"
88
  with open(st.session_state.pdf_path, "wb") as f:
89
  f.write(response.content)
90
  st.session_state.pdf_loaded = False
 
101
  with st.spinner("🔄 Processing document... Please wait."):
102
  loader = PDFPlumberLoader(st.session_state.pdf_path)
103
  docs = loader.load()
104
+
105
+ # Extract metadata
106
+ title, author, email_str, affiliation_str = extract_metadata(st.session_state.pdf_path)
107
+
108
+ # Display extracted metadata
109
+ st.subheader("📄 Extracted Document Metadata")
110
+ st.write(f"**Title:** {title}")
111
+ st.write(f"**Author:** {author}")
112
+ st.write(f"**Emails:** {email_str}")
113
+ st.write(f"**Affiliations:** {affiliation_str}")
114
 
115
  # Embedding Model
116
  model_name = "nomic-ai/modernbert-embed-base"
117
+ embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
118
 
119
  # Prevent unnecessary re-chunking
120
  if not st.session_state.chunked:
 
130
  if not st.session_state.vector_created and st.session_state.processed_chunks:
131
  with st.spinner("🔄 Initializing Vector Store..."):
132
  st.session_state.vector_store = Chroma(
133
+ persist_directory=CHROMA_DB_DIR, # <-- Ensures persistence
134
  collection_name="deepseek_collection",
135
  collection_metadata={"hnsw:space": "cosine"},
136
  embedding_function=embedding_model