Spaces:

tony-42069
/

cre-chatbot-rag

Sleeping

App Files Files Community

tony-42069 committed on Nov 27, 2024

Commit

fbfbbd7

1 Parent(s): 300ee92

Add main application files

Browse files

Files changed (4) hide show

app.py +116 -0
pdf_processor.py +42 -0
rag_engine.py +112 -0
streamlit_app.py +43 -0

app.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import streamlit as st
+import tempfile
+import os
+from pdf_processor import PDFProcessor
+from rag_engine import RAGEngine
+# Initialize session state
+if 'rag_engine' not in st.session_state:
+    try:
+        st.session_state.rag_engine = RAGEngine()
+    except ValueError as e:
+        st.error(f"Configuration Error: {str(e)}")
+        st.stop()
+    except ConnectionError as e:
+        st.error(f"Connection Error: {str(e)}")
+        st.stop()
+    except Exception as e:
+        st.error(f"Unexpected Error: {str(e)}")
+        st.stop()
+if 'processed_file' not in st.session_state:
+    st.session_state.processed_file = False
+# Page config
+st.set_page_config(page_title="Concept Definition Chatbot", layout="wide")
+st.title("Concept Definition Chatbot")
+# Sidebar for PDF upload
+with st.sidebar:
+    st.header("Upload PDF")
+    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+    if uploaded_file is not None and not st.session_state.processed_file:
+        with st.spinner("Processing PDF..."):
+            try:
+                # Save uploaded file temporarily
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                    tmp_file.write(uploaded_file.getvalue())
+                    tmp_path = tmp_file.name
+                # Process PDF
+                processor = PDFProcessor()
+                chunks = processor.process_pdf(tmp_path)
+                # Initialize RAG engine
+                st.session_state.rag_engine.initialize_vector_store(chunks)
+                st.session_state.processed_file = True
+                # Clean up
+                os.unlink(tmp_path)
+            except ValueError as e:
+                st.error(f"Configuration Error: {str(e)}")
+                st.stop()
+            except ConnectionError as e:
+                st.error(f"Connection Error: {str(e)}")
+                st.stop()
+            except Exception as e:
+                st.error(f"Unexpected Error: {str(e)}")
+                st.stop()
+        st.success("PDF processed successfully!")
+# Main chat interface
+if st.session_state.processed_file:
+    # Initialize chat history
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    # Display chat messages
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+            if "sources" in message:
+                with st.expander("View Sources"):
+                    for source in message["sources"]:
+                        st.markdown(f"**Page {source['page']}:**\n{source['text']}")
+    # Chat input
+    if prompt := st.chat_input("Ask a question about the concepts in your PDF"):
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        # Display user message
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        # Get bot response
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                try:
+                    response = st.session_state.rag_engine.answer_question(prompt)
+                    # Display response
+                    st.markdown(response["answer"])
+                    # Display sources in expander
+                    with st.expander("View Sources"):
+                        for source in response["sources"]:
+                            st.markdown(f"**Page {source['page']}:**\n{source['text']}")
+                    # Add assistant response to chat history
+                    st.session_state.messages.append({
+                        "role": "assistant",
+                        "content": response["answer"],
+                        "sources": response["sources"]
+                    })
+                except ValueError as e:
+                    st.error(f"Configuration Error: {str(e)}")
+                    st.stop()
+                except ConnectionError as e:
+                    st.error(f"Connection Error: {str(e)}")
+                    st.stop()
+                except Exception as e:
+                    st.error(f"Unexpected Error: {str(e)}")
+                    st.stop()
+else:
+    st.info("Please upload a PDF file to start chatting.")

pdf_processor.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from typing import List, Dict
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+class PDFProcessor:
+    def __init__(self):
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+    def process_pdf(self, pdf_path: str) -> List[Dict]:
+        """
+        Process a PDF file and return chunks of text with metadata.
+        Args:
+            pdf_path (str): Path to the PDF file
+        Returns:
+            List[Dict]: List of dictionaries containing text chunks and metadata
+        """
+        # Load PDF
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()
+        # Split text into chunks
+        chunks = self.text_splitter.split_documents(pages)
+        # Format chunks with metadata
+        processed_chunks = []
+        for chunk in chunks:
+            processed_chunks.append({
+                'text': chunk.page_content,
+                'metadata': {
+                    'page': chunk.metadata.get('page', 0) + 1,
+                    'source': pdf_path
+                }
+            })
+        return processed_chunks

rag_engine.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import os
+from typing import List, Dict
+from dotenv import load_dotenv
+import chromadb
+from langchain.embeddings import AzureOpenAIEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.chat_models import AzureChatOpenAI
+from langchain.chains import RetrievalQA
+import time
+# Load environment variables
+load_dotenv()
+class RAGEngine:
+    def __init__(self):
+        # Verify Azure OpenAI settings are set
+        required_vars = [
+            'AZURE_OPENAI_ENDPOINT',
+            'AZURE_OPENAI_KEY',
+            'AZURE_OPENAI_DEPLOYMENT_NAME',
+            'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
+        ]
+        missing_vars = [var for var in required_vars if not os.getenv(var)]
+        if missing_vars:
+            raise ValueError(f"Missing required Azure OpenAI settings: {', '.join(missing_vars)}")
+        # Initialize with retry mechanism
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                self.embeddings = AzureOpenAIEmbeddings(
+                    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
+                    azure_deployment=os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'),
+                    api_key=os.getenv('AZURE_OPENAI_KEY')
+                )
+                self.vector_store = None
+                self.qa_chain = None
+                # Test connection
+                self.embeddings.embed_query("test")
+                break
+            except Exception as e:
+                if attempt == max_retries - 1:
+                    raise ConnectionError(f"Failed to connect to Azure OpenAI API after {max_retries} attempts. Error: {str(e)}")
+                time.sleep(2)  # Wait before retrying
+    def initialize_vector_store(self, chunks: List[Dict]):
+        """
+        Initialize the vector store with document chunks.
+        Args:
+            chunks (List[Dict]): List of dictionaries containing text and metadata
+        """
+        texts = [chunk['text'] for chunk in chunks]
+        metadatas = [chunk['metadata'] for chunk in chunks]
+        # Create vector store
+        self.vector_store = Chroma.from_texts(
+            texts=texts,
+            embedding=self.embeddings,
+            metadatas=metadatas
+        )
+        # Initialize QA chain
+        llm = AzureChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", azure_deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'), azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'), api_key=os.getenv('AZURE_OPENAI_KEY'))
+        self.qa_chain = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=self.vector_store.as_retriever(
+                search_kwargs={"k": 3}
+            )
+        )
+    def answer_question(self, question: str) -> Dict:
+        """
+        Answer a question using the RAG system.
+        Args:
+            question (str): User's question
+        Returns:
+            Dict: Answer and source information
+        """
+        if not self.qa_chain:
+            raise ValueError("Vector store not initialized. Please process documents first.")
+        # Create a prompt that emphasizes definition extraction
+        prompt = f"""
+        Question: {question}
+        Please provide a clear and concise answer based on the provided context.
+        If the question asks for a definition or explanation of a concept,
+        make sure to provide that specifically. Include relevant examples or
+        additional context only if they help clarify the concept.
+        """
+        # Get answer from QA chain
+        result = self.qa_chain({"query": prompt})
+        # Get source documents
+        source_docs = self.vector_store.similarity_search(question, k=2)
+        sources = [
+            {
+                'page': doc.metadata['page'],
+                'text': doc.page_content[:200] + "..."  # Preview of source text
+            }
+            for doc in source_docs
+        ]
+        return {
+            'answer': result['result'],
+            'sources': sources
+        }

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import streamlit as st
+import os
+from dotenv import load_dotenv
+from pdf_processor import PDFProcessor
+from rag_engine import RAGEngine
+from app.config import AZURE_OPENAI_DEPLOYMENT_NAME
+# Load environment variables
+load_dotenv()
+# Initialize components
+pdf_processor = PDFProcessor()
+rag_engine = RAGEngine(deployment_name=AZURE_OPENAI_DEPLOYMENT_NAME)
+def main():
+    st.set_page_config(
+        page_title="CRE Knowledge Assistant",
+        page_icon="🤖",
+        layout="wide"
+    )
+    st.title("CRE Knowledge Assistant 🏢")
+    # File uploader
+    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
+    if uploaded_file:
+        try:
+            # Process the PDF
+            pdf_processor.process(uploaded_file)
+            st.success("PDF processed successfully! You can now ask questions about it.")
+            # Show chat interface
+            user_question = st.text_input("Ask a question about the document:")
+            if user_question:
+                response = rag_engine.get_response(user_question)
+                st.write("Answer:", response)
+        except Exception as e:
+            st.error(f"Error processing PDF: {str(e)}")
+if __name__ == "__main__":
+    main()