tony-42069 committed
Commit fbfbbd7 · 1 Parent(s): 300ee92

Add main application files

Files changed (4)
  1. app.py +116 -0
  2. pdf_processor.py +42 -0
  3. rag_engine.py +112 -0
  4. streamlit_app.py +43 -0
app.py ADDED
@@ -0,0 +1,116 @@
import streamlit as st
import tempfile
import os
from pdf_processor import PDFProcessor
from rag_engine import RAGEngine

# Page config (set_page_config must be the first Streamlit call,
# so it runs before any st.error() raised during initialization)
st.set_page_config(page_title="Concept Definition Chatbot", layout="wide")
st.title("Concept Definition Chatbot")

# Initialize session state
if 'rag_engine' not in st.session_state:
    try:
        st.session_state.rag_engine = RAGEngine()
    except ValueError as e:
        st.error(f"Configuration Error: {str(e)}")
        st.stop()
    except ConnectionError as e:
        st.error(f"Connection Error: {str(e)}")
        st.stop()
    except Exception as e:
        st.error(f"Unexpected Error: {str(e)}")
        st.stop()

if 'processed_file' not in st.session_state:
    st.session_state.processed_file = False

# Sidebar for PDF upload
with st.sidebar:
    st.header("Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    if uploaded_file is not None and not st.session_state.processed_file:
        with st.spinner("Processing PDF..."):
            try:
                # Save uploaded file temporarily
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_path = tmp_file.name

                # Process PDF
                processor = PDFProcessor()
                chunks = processor.process_pdf(tmp_path)

                # Initialize RAG engine
                st.session_state.rag_engine.initialize_vector_store(chunks)
                st.session_state.processed_file = True

                # Clean up
                os.unlink(tmp_path)
            except ValueError as e:
                st.error(f"Configuration Error: {str(e)}")
                st.stop()
            except ConnectionError as e:
                st.error(f"Connection Error: {str(e)}")
                st.stop()
            except Exception as e:
                st.error(f"Unexpected Error: {str(e)}")
                st.stop()
            st.success("PDF processed successfully!")

# Main chat interface
if st.session_state.processed_file:
    # Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])
            if "sources" in message:
                with st.expander("View Sources"):
                    for source in message["sources"]:
                        st.markdown(f"**Page {source['page']}:**\n{source['text']}")

    # Chat input
    if prompt := st.chat_input("Ask a question about the concepts in your PDF"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})

        # Display user message
        with st.chat_message("user"):
            st.markdown(prompt)

        # Get bot response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                try:
                    response = st.session_state.rag_engine.answer_question(prompt)

                    # Display response
                    st.markdown(response["answer"])

                    # Display sources in expander
                    with st.expander("View Sources"):
                        for source in response["sources"]:
                            st.markdown(f"**Page {source['page']}:**\n{source['text']}")

                    # Add assistant response to chat history
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response["answer"],
                        "sources": response["sources"]
                    })
                except ValueError as e:
                    st.error(f"Configuration Error: {str(e)}")
                    st.stop()
                except ConnectionError as e:
                    st.error(f"Connection Error: {str(e)}")
                    st.stop()
                except Exception as e:
                    st.error(f"Unexpected Error: {str(e)}")
                    st.stop()
else:
    st.info("Please upload a PDF file to start chatting.")
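For local testing, the app would typically be launched with Streamlit's CLI. The dependency list is an assumption inferred from the imports above (no requirements file is part of this commit): streamlit, langchain, chromadb, pypdf, and python-dotenv.

    streamlit run app.py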
pdf_processor.py ADDED
@@ -0,0 +1,42 @@
from typing import List, Dict
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

class PDFProcessor:
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Dict]: List of dictionaries containing text chunks and metadata
        """
        # Load PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        # Split text into chunks
        chunks = self.text_splitter.split_documents(pages)

        # Format chunks with metadata
        processed_chunks = []
        for chunk in chunks:
            processed_chunks.append({
                'text': chunk.page_content,
                'metadata': {
                    'page': chunk.metadata.get('page', 0) + 1,  # PyPDF pages are 0-indexed
                    'source': pdf_path
                }
            })

        return processed_chunks
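A minimal usage sketch for this class (the "sample.pdf" path is a hypothetical local file; the chunk shape follows from the code above):

    from pdf_processor import PDFProcessor

    processor = PDFProcessor()
    chunks = processor.process_pdf("sample.pdf")  # hypothetical local file
    # Each entry is a dict of the form:
    # {'text': '...', 'metadata': {'page': 1, 'source': 'sample.pdf'}}
    print(len(chunks), chunks[0]['metadata'])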
rag_engine.py ADDED
@@ -0,0 +1,112 @@
import os
import time
from typing import List, Dict
from dotenv import load_dotenv
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import RetrievalQA

# Load environment variables
load_dotenv()

class RAGEngine:
    def __init__(self):
        # Verify Azure OpenAI settings are set
        required_vars = [
            'AZURE_OPENAI_ENDPOINT',
            'AZURE_OPENAI_KEY',
            'AZURE_OPENAI_DEPLOYMENT_NAME',
            'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
        ]

        missing_vars = [var for var in required_vars if not os.getenv(var)]
        if missing_vars:
            raise ValueError(f"Missing required Azure OpenAI settings: {', '.join(missing_vars)}")

        # Initialize with retry mechanism
        max_retries = 3
        for attempt in range(max_retries):
            try:
                self.embeddings = AzureOpenAIEmbeddings(
                    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
                    azure_deployment=os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'),
                    api_key=os.getenv('AZURE_OPENAI_KEY')
                )
                self.vector_store = None
                self.qa_chain = None
                # Test connection
                self.embeddings.embed_query("test")
                break
            except Exception as e:
                if attempt == max_retries - 1:
                    raise ConnectionError(
                        f"Failed to connect to Azure OpenAI API after {max_retries} attempts. Error: {str(e)}"
                    )
                time.sleep(2)  # Wait before retrying

    def initialize_vector_store(self, chunks: List[Dict]):
        """
        Initialize the vector store with document chunks.

        Args:
            chunks (List[Dict]): List of dictionaries containing text and metadata
        """
        texts = [chunk['text'] for chunk in chunks]
        metadatas = [chunk['metadata'] for chunk in chunks]

        # Create vector store
        self.vector_store = Chroma.from_texts(
            texts=texts,
            embedding=self.embeddings,
            metadatas=metadatas
        )

        # Initialize QA chain
        # (parameter is azure_deployment, matching the embeddings client above)
        llm = AzureChatOpenAI(
            temperature=0,
            model_name="gpt-3.5-turbo",
            azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
            api_key=os.getenv('AZURE_OPENAI_KEY')
        )
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(
                search_kwargs={"k": 3}
            )
        )

    def answer_question(self, question: str) -> Dict:
        """
        Answer a question using the RAG system.

        Args:
            question (str): User's question

        Returns:
            Dict: Answer and source information
        """
        if not self.qa_chain:
            raise ValueError("Vector store not initialized. Please process documents first.")

        # Create a prompt that emphasizes definition extraction
        prompt = f"""
        Question: {question}
        Please provide a clear and concise answer based on the provided context.
        If the question asks for a definition or explanation of a concept,
        make sure to provide that specifically. Include relevant examples or
        additional context only if they help clarify the concept.
        """

        # Get answer from QA chain
        result = self.qa_chain({"query": prompt})

        # Get source documents
        source_docs = self.vector_store.similarity_search(question, k=2)
        sources = [
            {
                'page': doc.metadata['page'],
                'text': doc.page_content[:200] + "..."  # Preview of source text
            }
            for doc in source_docs
        ]

        return {
            'answer': result['result'],
            'sources': sources
        }
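RAGEngine reads its configuration from the environment, so a .env file alongside the code would need the four variables checked in required_vars. A sketch with placeholder values (depending on the installed langchain/openai versions, an OPENAI_API_VERSION entry may also be required):

    AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
    AZURE_OPENAI_KEY=<your-api-key>
    AZURE_OPENAI_DEPLOYMENT_NAME=<chat-model-deployment>
    AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=<embedding-model-deployment>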
streamlit_app.py ADDED
@@ -0,0 +1,43 @@
import streamlit as st
import tempfile
import os
from dotenv import load_dotenv
from pdf_processor import PDFProcessor
from rag_engine import RAGEngine

# Load environment variables
load_dotenv()

# Initialize components
# (RAGEngine takes no arguments; it reads its Azure OpenAI settings from the environment)
pdf_processor = PDFProcessor()
rag_engine = RAGEngine()

def main():
    st.set_page_config(
        page_title="CRE Knowledge Assistant",
        page_icon="🤖",
        layout="wide"
    )

    st.title("CRE Knowledge Assistant 🏢")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

    if uploaded_file:
        try:
            # Save the upload to a temporary file and process it
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_path = tmp_file.name
            chunks = pdf_processor.process_pdf(tmp_path)
            rag_engine.initialize_vector_store(chunks)
            os.unlink(tmp_path)
            st.success("PDF processed successfully! You can now ask questions about it.")

            # Show chat interface
            user_question = st.text_input("Ask a question about the document:")
            if user_question:
                response = rag_engine.answer_question(user_question)
                st.write("Answer:", response["answer"])

        except Exception as e:
            st.error(f"Error processing PDF: {str(e)}")

if __name__ == "__main__":
    main()