tony-42069 committed on
Commit 4ad48e3 · 1 Parent(s): 0169adb

Add core configuration files

.deployment DELETED
@@ -1,3 +0,0 @@
- [config]
- SCM_DO_BUILD_DURING_DEPLOYMENT=true
- PYTHON_ENABLE_GUNICORN=false
 
.devcontainer/devcontainer.json DELETED
@@ -1,33 +0,0 @@
- {
-     "name": "Python 3",
-     // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
-     "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
-     "customizations": {
-         "codespaces": {
-             "openFiles": [
-                 "README.md",
-                 "streamlit_app.py"
-             ]
-         },
-         "vscode": {
-             "settings": {},
-             "extensions": [
-                 "ms-python.python",
-                 "ms-python.vscode-pylance"
-             ]
-         }
-     },
-     "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
-     "postAttachCommand": {
-         "server": "streamlit run streamlit_app.py --server.enableCORS false --server.enableXsrfProtection false"
-     },
-     "portsAttributes": {
-         "8501": {
-             "label": "Application",
-             "onAutoForward": "openPreview"
-         }
-     },
-     "forwardPorts": [
-         8501
-     ]
- }
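For readability, here is the `updateContentCommand` one-liner unrolled as a script. This is a sketch with the same assumed semantics (the `;` separators mean each group runs regardless of the previous one's result):

```bash
# Install OS packages listed in packages.txt, if present
if [ -f packages.txt ]; then
    sudo apt update && sudo apt upgrade -y
    sudo xargs apt install -y <packages.txt
fi
# Install Python dependencies, if present, then Streamlit itself
if [ -f requirements.txt ]; then
    pip3 install --user -r requirements.txt
fi
pip3 install --user streamlit
echo '✅ Packages installed and Requirements met'
```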
 
.dockerignore DELETED
@@ -1,9 +0,0 @@
- .git
- .gitignore
- .env
- __pycache__
- *.pyc
- vector_store/
- venv/
- .pytest_cache/
- logs/
 
.gitattributes DELETED
@@ -1 +0,0 @@
- *.pdf filter=lfs diff=lfs merge=lfs -text
 
.github/workflows/huggingface-spaces-sync.yml DELETED
@@ -1,30 +0,0 @@
- name: Sync to Hugging Face Spaces
- on:
-   push:
-     branches: [main]
-
- jobs:
-   sync:
-     runs-on: ubuntu-latest
-     steps:
-       - uses: actions/checkout@v3
-         with:
-           fetch-depth: 0
-           lfs: true
-
-       - name: Setup Git LFS
-         run: |
-           curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
-           sudo apt-get install git-lfs
-           git lfs install
-
-       - name: Push to Hugging Face Spaces
-         env:
-           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-         run: |
-           git config --global user.email "[email protected]"
-           git config --global user.name "GitHub Actions"
-           git remote add hf https://tony-42069:[email protected]/spaces/tony-42069/cre-chatbot-rag
-           git fetch hf
-           git lfs push --all hf main
-           git push -f hf main
 
.gitignore DELETED
@@ -1,35 +0,0 @@
- # Next.js
- .next/
- node_modules/
- out/
-
- # Virtual environment
- venv/
- env/
- ENV/
-
- # Python
- __pycache__/
- *.py[cod]
- *$py.class
-
- # Distribution / packaging
- dist/
- build/
- *.egg-info/
-
- # Local development settings
- .env
- .env.local
-
- # IDE
- .vscode/
- .idea/
-
- # Operating System
- .DS_Store
- Thumbs.db
-
- # Misc
- *.pem
- .vercel
 
.streamlit/config.toml DELETED
@@ -1,9 +0,0 @@
- [theme]
- primaryColor = "#FF4B4B"
- backgroundColor = "#FFFFFF"
- secondaryBackgroundColor = "#F0F2F6"
- textColor = "#262730"
- font = "sans serif"
-
- [server]
- maxUploadSize = 200
 
Dataset/Commercial Lending 101.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:46f8deca30acd6c8b45ca371babf4bbcd1848916f09c088a33e2edcf46164746
- size 6879185
 
Dockerfile DELETED
@@ -1,16 +0,0 @@
- FROM python:3.11-slim
-
- WORKDIR /app
-
- COPY requirements.txt .
- RUN pip install -r requirements.txt
-
- COPY . .
-
- # Make port configurable via environment variable
- ENV PORT=8501
-
- EXPOSE ${PORT}
-
- # Use the correct path to app.py and make port configurable
- ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
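Note on the last line: in the JSON-array (exec) form of `ENTRYPOINT`, no shell is involved, so `${PORT}` is passed to Streamlit literally rather than expanded (`EXPOSE ${PORT}` is fine, since Dockerfile instructions substitute `ENV` values at build time). A minimal sketch of a working variant, using the shell form so the variable expands at runtime:

```dockerfile
ENTRYPOINT streamlit run app.py --server.port=${PORT} --server.address=0.0.0.0
```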
 
README.md DELETED
@@ -1,122 +0,0 @@
- # Commercial Real Estate Knowledge Assistant
-
- ![Commercial Lending 101](Dataset/commercial-lending-101.png)
-
- A sophisticated Retrieval-Augmented Generation (RAG) chatbot that transforms how professionals understand commercial real estate concepts. Built with Azure OpenAI and modern Python technologies, this assistant processes commercial real estate documentation and provides accurate, context-aware answers to your questions.
-
- ## 🚀 Deployments
- - **Live Demo**: [Try it on Hugging Face Spaces](https://huggingface.co/spaces/tony-42069/cre-chatbot-rag)
-
- ## 🌟 Key Features
- - **Multi-Document Support**: Process and analyze multiple PDF documents simultaneously
- - **Intelligent PDF Processing**: Advanced document analysis and text extraction
- - **Azure OpenAI Integration**: Leveraging GPT-3.5 Turbo for accurate, contextual responses
- - **Semantic Search**: Using Azure OpenAI embeddings for precise context retrieval
- - **Vector Storage**: Efficient document indexing with ChromaDB
- - **Modern UI**: Beautiful chat interface with message history and source tracking
- - **Enterprise-Ready**: Comprehensive logging and error handling
-
- ## 🎯 Use Cases
- - **Training & Education**: Help new CRE professionals understand industry concepts
- - **Quick Reference**: Instant access to definitions and explanations
- - **Document Analysis**: Extract insights from CRE documentation
- - **Knowledge Base**: Build and query your own CRE knowledge repository
-
- ## 🚀 Quick Start
-
- ### Prerequisites
- - Python 3.8+
- - Azure OpenAI Service access with:
-   - `gpt-35-turbo` model deployment
-   - `text-embedding-ada-002` model deployment
-
- ### Installation
- 1. Clone the repository:
- ```bash
- git clone https://github.com/tony-42069/cre-chatbot-rag.git
- cd cre-chatbot-rag
- ```
-
- 2. Create and activate virtual environment:
- ```bash
- python -m venv venv
- venv\Scripts\activate
- ```
-
- 3. Install dependencies:
- ```bash
- pip install -r requirements.txt
- ```
-
- 4. Create `.env` file with Azure OpenAI credentials:
- ```env
- AZURE_OPENAI_ENDPOINT=your_endpoint_here
- AZURE_OPENAI_KEY=your_key_here
- AZURE_OPENAI_DEPLOYMENT_NAME=your_gpt_deployment_name
- AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=text-embedding-ada-002
- ```
-
- 5. Run the application:
- ```bash
- streamlit run app/main.py
- ```
-
- ## 🔌 Embedding
- To embed this chatbot in your website, use the following HTML code:
-
- ```html
- <iframe
-   src="https://tony-42069-cre-chatbot-rag.hf.space"
-   frameborder="0"
-   width="850px"
-   height="450px"
- ></iframe>
- ```
-
- ## 💡 Features
-
- ### Modern Chat Interface
- - Clean, professional design
- - Persistent chat history
- - Source context tracking
- - Multiple document management
- - Real-time processing feedback
-
- ### Advanced RAG Implementation
- - Semantic chunking of documents
- - Azure OpenAI embeddings for accurate retrieval
- - Context-aware answer generation
- - Multi-document knowledge base
- - Source attribution for answers
-
- ### Enterprise Security
- - Secure credential management
- - Azure OpenAI integration
- - Local vector storage with ChromaDB
- - Comprehensive error handling
- - Detailed logging system
-
- ## 🛠️ Technical Stack
- - **Frontend**: Streamlit
- - **Language Models**: Azure OpenAI (GPT-3.5 Turbo)
- - **Embeddings**: Azure OpenAI (text-embedding-ada-002)
- - **Vector Store**: ChromaDB
- - **PDF Processing**: PyPDF2
- - **Framework**: LangChain
-
- ## 📚 Documentation
- - [Azure OpenAI Service](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service/)
- - [Streamlit](https://streamlit.io/)
- - [LangChain](https://python.langchain.com/)
- - [ChromaDB](https://www.trychroma.com/)
-
- ## 🤝 Contributing
- Contributions are welcome! Please feel free to submit a Pull Request.
-
- ## 📄 License
- This project is licensed under the MIT License - see the LICENSE file for details.
-
- ## 🙏 Acknowledgments
- - Azure OpenAI team for providing the powerful language models
- - LangChain community for the excellent RAG framework
- - Streamlit team for the amazing web framework
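A small portability note on installation step 2: `venv\Scripts\activate` is the Windows command. On macOS/Linux the equivalent is:

```bash
source venv/bin/activate
```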
 
api/__init__.py DELETED
@@ -1,8 +0,0 @@
- import azure.functions as func
- import streamlit as st
-
- def main(req: func.HttpRequest) -> func.HttpResponse:
-     return func.HttpResponse(
-         "This is the API endpoint for the CRE Knowledge Assistant",
-         status_code=200
-     )
 
api/function_app.py DELETED
@@ -1,71 +0,0 @@
- import azure.functions as func
- import logging
- import json
- from io import BytesIO
-
- # Add the project root to Python path
- import sys
- import os
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
- from app.config import validate_config
- from app.logging import setup_logging
- from src.pdf_processor import PDFProcessor
- from src.rag_engine import RAGEngine
-
- # Initialize components
- setup_logging()
- logger = logging.getLogger('app')
- pdf_processor = PDFProcessor()
- rag_engine = RAGEngine()
-
- def process_pdf(req: func.HttpRequest) -> func.HttpResponse:
-     try:
-         # Get the PDF file from the request
-         pdf_file = req.files['file']
-         pdf_bytes = pdf_file.read()
-
-         # Process the PDF
-         pdf_processor.process(BytesIO(pdf_bytes))
-
-         return func.HttpResponse(
-             json.dumps({"message": "PDF processed successfully"}),
-             mimetype="application/json",
-             status_code=200
-         )
-     except Exception as e:
-         logger.error(f"Error processing PDF: {str(e)}")
-         return func.HttpResponse(
-             json.dumps({"error": str(e)}),
-             mimetype="application/json",
-             status_code=500
-         )
-
- def query(req: func.HttpRequest) -> func.HttpResponse:
-     try:
-         # Get the query from request body
-         req_body = req.get_json()
-         user_query = req_body.get('query')
-
-         if not user_query:
-             return func.HttpResponse(
-                 json.dumps({"error": "No query provided"}),
-                 mimetype="application/json",
-                 status_code=400
-             )
-
-         # Process query through RAG engine
-         answer = rag_engine.process_query(user_query)
-
-         return func.HttpResponse(
-             json.dumps({"answer": answer}),
-             mimetype="application/json",
-             status_code=200
-         )
-     except Exception as e:
-         logger.error(f"Error processing query: {str(e)}")
-         return func.HttpResponse(
-             json.dumps({"error": str(e)}),
-             mimetype="application/json",
-             status_code=500
-         )
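A minimal client sketch for exercising these handlers. The base URL and route paths are assumptions (no `function.json` bindings or route decorators appear in this commit), shown here against a hypothetical local Azure Functions host:

```python
import requests

BASE = "http://localhost:7071/api"  # hypothetical local Functions host

# Upload a PDF to the process_pdf handler
with open("Dataset/Commercial Lending 101.pdf", "rb") as f:
    print(requests.post(f"{BASE}/process_pdf", files={"file": f}).json())

# Ask a question via the query handler
print(requests.post(f"{BASE}/query", json={"query": "What is DSCR?"}).json())
```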
 
api/requirements.txt DELETED
@@ -1,7 +0,0 @@
- azure-functions==1.15.0
- openai==1.6.1
- python-dotenv==1.0.0
- azure-cognitiveservices-language-textanalytics==0.2.0
- PyPDF2==3.0.1
- langchain==0.0.352
- azure-storage-blob==12.19.0
 
app.py DELETED
@@ -1,116 +0,0 @@
- import streamlit as st
- import tempfile
- import os
- from pdf_processor import PDFProcessor
- from rag_engine import RAGEngine
-
- # Initialize session state
- if 'rag_engine' not in st.session_state:
-     try:
-         st.session_state.rag_engine = RAGEngine()
-     except ValueError as e:
-         st.error(f"Configuration Error: {str(e)}")
-         st.stop()
-     except ConnectionError as e:
-         st.error(f"Connection Error: {str(e)}")
-         st.stop()
-     except Exception as e:
-         st.error(f"Unexpected Error: {str(e)}")
-         st.stop()
-
- if 'processed_file' not in st.session_state:
-     st.session_state.processed_file = False
-
- # Page config
- st.set_page_config(page_title="Concept Definition Chatbot", layout="wide")
- st.title("Concept Definition Chatbot")
-
- # Sidebar for PDF upload
- with st.sidebar:
-     st.header("Upload PDF")
-     uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
-
-     if uploaded_file is not None and not st.session_state.processed_file:
-         with st.spinner("Processing PDF..."):
-             try:
-                 # Save uploaded file temporarily
-                 with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
-                     tmp_file.write(uploaded_file.getvalue())
-                     tmp_path = tmp_file.name
-
-                 # Process PDF
-                 processor = PDFProcessor()
-                 chunks = processor.process_pdf(tmp_path)
-
-                 # Initialize RAG engine
-                 st.session_state.rag_engine.initialize_vector_store(chunks)
-                 st.session_state.processed_file = True
-
-                 # Clean up
-                 os.unlink(tmp_path)
-             except ValueError as e:
-                 st.error(f"Configuration Error: {str(e)}")
-                 st.stop()
-             except ConnectionError as e:
-                 st.error(f"Connection Error: {str(e)}")
-                 st.stop()
-             except Exception as e:
-                 st.error(f"Unexpected Error: {str(e)}")
-                 st.stop()
-             st.success("PDF processed successfully!")
-
- # Main chat interface
- if st.session_state.processed_file:
-     # Initialize chat history
-     if "messages" not in st.session_state:
-         st.session_state.messages = []
-
-     # Display chat messages
-     for message in st.session_state.messages:
-         with st.chat_message(message["role"]):
-             st.markdown(message["content"])
-             if "sources" in message:
-                 with st.expander("View Sources"):
-                     for source in message["sources"]:
-                         st.markdown(f"**Page {source['page']}:**\n{source['text']}")
-
-     # Chat input
-     if prompt := st.chat_input("Ask a question about the concepts in your PDF"):
-         # Add user message to chat history
-         st.session_state.messages.append({"role": "user", "content": prompt})
-
-         # Display user message
-         with st.chat_message("user"):
-             st.markdown(prompt)
-
-         # Get bot response
-         with st.chat_message("assistant"):
-             with st.spinner("Thinking..."):
-                 try:
-                     response = st.session_state.rag_engine.answer_question(prompt)
-
-                     # Display response
-                     st.markdown(response["answer"])
-
-                     # Display sources in expander
-                     with st.expander("View Sources"):
-                         for source in response["sources"]:
-                             st.markdown(f"**Page {source['page']}:**\n{source['text']}")
-
-                     # Add assistant response to chat history
-                     st.session_state.messages.append({
-                         "role": "assistant",
-                         "content": response["answer"],
-                         "sources": response["sources"]
-                     })
-                 except ValueError as e:
-                     st.error(f"Configuration Error: {str(e)}")
-                     st.stop()
-                 except ConnectionError as e:
-                     st.error(f"Connection Error: {str(e)}")
-                     st.stop()
-                 except Exception as e:
-                     st.error(f"Unexpected Error: {str(e)}")
-                     st.stop()
- else:
-     st.info("Please upload a PDF file to start chatting.")
 
app/__init__.py DELETED
@@ -1 +0,0 @@
-
 
app/config.py DELETED
@@ -1,45 +0,0 @@
- """
- Configuration management for the CRE Chatbot application.
- """
- import os
- from dotenv import load_dotenv
-
- # Load environment variables
- load_dotenv()
-
- # Azure OpenAI Configuration
- AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
- AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_KEY')
- AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME')
- AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
-
- # Application Configuration
- MAX_CHUNK_SIZE = 1000
- OVERLAP_SIZE = 200
- TEMPERATURE = 0.7
- MAX_TOKENS = 500
-
- # Logging Configuration
- LOG_LEVEL = "INFO"
- LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
- LOG_FILE = "logs/app.log"
-
- # Vector Store Configuration
- VECTOR_STORE_PATH = "vector_store"
-
- def validate_config():
-     """Validate that all required configuration variables are set."""
-     required_vars = [
-         'AZURE_OPENAI_ENDPOINT',
-         'AZURE_OPENAI_API_KEY',
-         'AZURE_OPENAI_DEPLOYMENT_NAME',
-         'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
-     ]
-
-     missing_vars = [var for var in required_vars if not os.getenv(var)]
-
-     if missing_vars:
-         raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
-
- # Validate that all required configuration variables are set.
- validate_config()
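One mismatch worth flagging: the module reads the key from the `AZURE_OPENAI_KEY` environment variable (the name the README's `.env` template uses), but `validate_config()` checks `os.getenv('AZURE_OPENAI_API_KEY')`, which is never set, so import-time validation would always raise. A sketch of the presumed intent:

```python
required_vars = [
    'AZURE_OPENAI_ENDPOINT',
    'AZURE_OPENAI_KEY',  # matches os.getenv('AZURE_OPENAI_KEY') above
    'AZURE_OPENAI_DEPLOYMENT_NAME',
    'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
]
```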
 
app/logging.py DELETED
@@ -1,59 +0,0 @@
- """
- Logging configuration for the CRE Chatbot application.
- """
- import logging
- import os
- from logging.handlers import RotatingFileHandler
- from .config import LOG_LEVEL, LOG_FORMAT, LOG_FILE
-
- def setup_logging():
-     """Set up logging configuration for the application."""
-     # Create logs directory if it doesn't exist
-     os.makedirs('logs', exist_ok=True)
-
-     # Set up root logger
-     logger = logging.getLogger()
-     logger.setLevel(LOG_LEVEL)
-
-     # Create formatters and handlers
-     formatter = logging.Formatter(LOG_FORMAT)
-
-     # Console Handler
-     console_handler = logging.StreamHandler()
-     console_handler.setFormatter(formatter)
-     logger.addHandler(console_handler)
-
-     # File Handler
-     file_handler = RotatingFileHandler(
-         LOG_FILE,
-         maxBytes=10485760,  # 10MB
-         backupCount=5
-     )
-     file_handler.setFormatter(formatter)
-     logger.addHandler(file_handler)
-
-     # Create separate loggers for different components
-     loggers = {
-         'api': setup_component_logger('api'),
-         'pdf': setup_component_logger('pdf'),
-         'rag': setup_component_logger('rag'),
-         'app': setup_component_logger('app')
-     }
-
-     return loggers
-
- def setup_component_logger(name):
-     """Set up a logger for a specific component."""
-     logger = logging.getLogger(name)
-     logger.setLevel(LOG_LEVEL)
-
-     # Create component-specific log file
-     handler = RotatingFileHandler(
-         f'logs/{name}.log',
-         maxBytes=10485760,  # 10MB
-         backupCount=3
-     )
-     handler.setFormatter(logging.Formatter(LOG_FORMAT))
-     logger.addHandler(handler)
-
-     return logger
 
app/main.py DELETED
@@ -1,209 +0,0 @@
- """
- Main Streamlit application for the CRE Chatbot.
- """
- import logging
- import streamlit as st
- from io import BytesIO
- import sys
- import os
-
- # Add the project root to Python path
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
- from app.config import validate_config, AZURE_OPENAI_DEPLOYMENT_NAME
- from app.logging import setup_logging
- from src.pdf_processor import PDFProcessor
- from src.rag_engine import RAGEngine
-
- # Setup logging
- loggers = setup_logging()
- logger = logging.getLogger('app')
-
- # Page configuration
- st.set_page_config(
-     page_title="CRE Knowledge Assistant",
-     page_icon="🏢",
-     layout="wide",
-     initial_sidebar_state="expanded"
- )
-
- # Custom CSS
- st.markdown("""
- <style>
-     .main {
-         background-color: #f5f5f5;
-     }
-     .stApp {
-         max-width: 1200px;
-         margin: 0 auto;
-     }
-     .chat-message {
-         padding: 1.5rem;
-         border-radius: 0.5rem;
-         margin-bottom: 1rem;
-         display: flex;
-         flex-direction: column;
-     }
-     .chat-message.user {
-         background-color: #e3f2fd;
-     }
-     .chat-message.assistant {
-         background-color: #f3e5f5;
-     }
-     .chat-message .message {
-         margin-top: 0.5rem;
-     }
- </style>
- """, unsafe_allow_html=True)
-
- # Initialize session state
- if 'rag_engine' not in st.session_state:
-     st.session_state.rag_engine = None
- if 'pdf_processor' not in st.session_state:
-     st.session_state.pdf_processor = PDFProcessor()
- if 'chat_history' not in st.session_state:
-     st.session_state.chat_history = []
- if 'uploaded_pdfs' not in st.session_state:
-     st.session_state.uploaded_pdfs = set()
-
- def initialize_rag_engine(deployment_name: str):
-     """Initialize the RAG engine with error handling."""
-     try:
-         st.session_state.rag_engine = RAGEngine(deployment_name)
-         logger.info("RAG Engine initialized successfully")
-     except Exception as e:
-         logger.error(f"Error initializing the application: {str(e)}")
-         st.error(f"Error initializing the application: {str(e)}")
-
- def process_pdf(pdf_file):
-     """Process uploaded PDF file."""
-     try:
-         # Check if PDF was already processed
-         if pdf_file.name in st.session_state.uploaded_pdfs:
-             st.warning(f"'{pdf_file.name}' has already been processed!")
-             return
-
-         with st.spinner(f"Processing {pdf_file.name}..."):
-             # Read PDF content
-             pdf_content = pdf_file.read()
-
-             # Process PDF and get chunks
-             chunks = st.session_state.pdf_processor.process_pdf(
-                 BytesIO(pdf_content)
-             )
-
-             # Add chunks to vector store
-             texts = [chunk[0] for chunk in chunks]
-             metadata = [{"source": pdf_file.name, **chunk[1]} for chunk in chunks]
-             st.session_state.rag_engine.add_documents(texts, metadata)
-
-             # Mark PDF as processed
-             st.session_state.uploaded_pdfs.add(pdf_file.name)
-
-         st.success(f"Successfully processed '{pdf_file.name}'!")
-         logger.info(f"PDF '{pdf_file.name}' processed and added to vector store")
-
-     except Exception as e:
-         logger.error(f"Error processing PDF: {str(e)}")
-         st.error(f"Error processing PDF: {str(e)}")
-
- def display_chat_message(role: str, content: str):
-     """Display a chat message with proper styling."""
-     with st.container():
-         st.markdown(f"""
-         <div class="chat-message {role}">
-             <div class="role"><strong>{'You' if role == 'user' else 'Assistant'}:</strong></div>
-             <div class="message">{content}</div>
-         </div>
-         """, unsafe_allow_html=True)
-
- def main():
-     """Main application function."""
-     # Header
-     col1, col2 = st.columns([2, 1])
-     with col1:
-         st.title("🏢 CRE Knowledge Assistant")
-         st.markdown("*Your AI guide for commercial real estate concepts*")
-
-     # Sidebar
-     with st.sidebar:
-         st.header("📚 Knowledge Base")
-         st.markdown("Upload your CRE documents to enhance the assistant's knowledge.")
-
-         # Model configuration (collapsible)
-         with st.expander("⚙️ Model Configuration"):
-             deployment_name = st.text_input(
-                 "Model Deployment Name",
-                 value=AZURE_OPENAI_DEPLOYMENT_NAME,
-                 help="Enter your Azure OpenAI model deployment name"
-             )
-
-         # Initialize RAG engine if not already done
-         if not st.session_state.rag_engine:
-             initialize_rag_engine(deployment_name)
-
-         # PDF upload section
-         st.subheader("📄 Upload Documents")
-         uploaded_files = st.file_uploader(
-             "Choose PDF files",
-             type="pdf",
-             accept_multiple_files=True,
-             help="Upload one or more PDF files to add to the knowledge base"
-         )
-
-         if uploaded_files:
-             for pdf_file in uploaded_files:
-                 process_pdf(pdf_file)
-
-         # Show processed documents
-         if st.session_state.uploaded_pdfs:
-             st.subheader("📚 Processed Documents")
-             for pdf_name in st.session_state.uploaded_pdfs:
-                 st.markdown(f"✓ {pdf_name}")
-
-     # Main chat interface
-     if st.session_state.rag_engine:
-         # Display chat history
-         for message in st.session_state.chat_history:
-             display_chat_message(
-                 role=message["role"],
-                 content=message["content"]
-             )
-
-         # Chat input
-         user_question = st.text_input(
-             "Ask a question about commercial real estate:",
-             placeholder="e.g., What is LTV? How is DSCR calculated?",
-             key="user_question"
-         )
-
-         if user_question:
-             try:
-                 # Add user message to chat
-                 st.session_state.chat_history.append({
-                     "role": "user",
-                     "content": user_question
-                 })
-
-                 with st.spinner("Generating answer..."):
-                     response = st.session_state.rag_engine.query(user_question)
-
-                 # Add assistant response to chat
-                 st.session_state.chat_history.append({
-                     "role": "assistant",
-                     "content": response["answer"]
-                 })
-
-                 # Display latest messages immediately
-                 display_chat_message("user", user_question)
-                 display_chat_message("assistant", response["answer"])
-
-             except Exception as e:
-                 logger.error(f"Error generating answer: {str(e)}")
-                 st.error(f"Error generating answer: {str(e)}")
-
-     else:
-         st.info("👆 Please upload PDF documents in the sidebar to start asking questions!")
-
- if __name__ == "__main__":
-     main()
 
docker-compose.yml DELETED
@@ -1,15 +0,0 @@
- version: '3.8'
-
- services:
-   chatbot:
-     build: .
-     ports:
-       - "8501:8501"
-     environment:
-       - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT}
-       - AZURE_OPENAI_KEY=${AZURE_OPENAI_KEY}
-       - AZURE_OPENAI_DEPLOYMENT_NAME=${AZURE_OPENAI_DEPLOYMENT_NAME}
-       - AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=${AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME}
-     volumes:
-       - ./vector_store:/app/vector_store
-       - ./logs:/app/logs
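With the four Azure OpenAI variables defined in a local `.env` file (Compose substitutes `${...}` values from it), the service can be built and started with:

```bash
docker compose up --build
```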
 
frontend/main.py DELETED
@@ -1,59 +0,0 @@
- import streamlit as st
- import requests
- import sys
- import os
-
- # Add the project root to Python path
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
- from app.config import validate_config
- from app.logging import setup_logging
-
- def main():
-     # Setup logging
-     setup_logging()
-
-     st.set_page_config(
-         page_title="CRE Knowledge Assistant",
-         page_icon="🤖",
-         layout="wide"
-     )
-
-     st.title("CRE Knowledge Assistant")
-
-     # File uploader
-     uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
-
-     if uploaded_file:
-         # Convert file to bytes
-         file_bytes = uploaded_file.getvalue()
-
-         # Send to API endpoint
-         response = requests.post(
-             "api/process_pdf",
-             files={"file": (uploaded_file.name, file_bytes, "application/pdf")}
-         )
-
-         if response.status_code == 200:
-             st.success("PDF processed successfully!")
-         else:
-             st.error("Error processing PDF")
-
-     # Query input
-     query = st.text_input("Ask a question about your documents:")
-
-     if query:
-         # Send query to API endpoint
-         response = requests.post(
-             "api/query",
-             json={"query": query}
-         )
-
-         if response.status_code == 200:
-             result = response.json()
-             st.write("Answer:", result["answer"])
-         else:
-             st.error("Error processing query")
-
- if __name__ == "__main__":
-     main()
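The two `requests.post("api/...", ...)` calls use schemeless relative URLs, which `requests` rejects with a `MissingSchema` error. A sketch of the presumed intent, with a hypothetical absolute base URL:

```python
API_BASE = "http://localhost:7071/api"  # hypothetical; adjust to the deployed API host

response = requests.post(
    f"{API_BASE}/process_pdf",
    files={"file": (uploaded_file.name, file_bytes, "application/pdf")}
)
```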
 
frontend/requirements.txt DELETED
@@ -1,3 +0,0 @@
- streamlit==1.29.0
- requests==2.31.0
- python-dotenv==1.0.0
 
index.html DELETED
@@ -1,30 +0,0 @@
- <!DOCTYPE html>
- <html>
- <head>
-     <title>CRE Knowledge Assistant</title>
-     <style>
-         body, html {
-             margin: 0;
-             padding: 0;
-             height: 100%;
-             overflow: hidden;
-         }
-         iframe {
-             width: 100%;
-             height: 100vh;
-             border: none;
-         }
-     </style>
- </head>
- <body>
-     <iframe src="/api" allow="camera;microphone"></iframe>
-     <script>
-         window.addEventListener('message', function(e) {
-             // Handle any messages from the Streamlit app
-             if (e.data.type === 'streamlit') {
-                 console.log('Received message from Streamlit:', e.data);
-             }
-         });
-     </script>
- </body>
- </html>
 
package-lock.json DELETED
The diff for this file is too large to render. See raw diff
 
package.json DELETED
@@ -1,21 +0,0 @@
- {
-     "name": "cre-chatbot-rag",
-     "version": "1.0.0",
-     "private": true,
-     "scripts": {
-         "dev": "next dev",
-         "build": "next build",
-         "start": "next start"
-     },
-     "dependencies": {
-         "@chakra-ui/react": "^2.8.2",
-         "@emotion/react": "^11.11.1",
-         "@emotion/styled": "^11.11.0",
-         "axios": "^1.6.2",
-         "framer-motion": "^10.16.16",
-         "next": "^14.0.4",
-         "react": "^18.2.0",
-         "react-dom": "^18.2.0",
-         "vercel": "^39.1.1"
-     }
- }
 
pdf_processor.py DELETED
@@ -1,42 +0,0 @@
- from typing import List, Dict
- from langchain.document_loaders import PyPDFLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
-
- class PDFProcessor:
-     def __init__(self):
-         self.text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=1000,
-             chunk_overlap=200,
-             length_function=len,
-             separators=["\n\n", "\n", " ", ""]
-         )
-
-     def process_pdf(self, pdf_path: str) -> List[Dict]:
-         """
-         Process a PDF file and return chunks of text with metadata.
-
-         Args:
-             pdf_path (str): Path to the PDF file
-
-         Returns:
-             List[Dict]: List of dictionaries containing text chunks and metadata
-         """
-         # Load PDF
-         loader = PyPDFLoader(pdf_path)
-         pages = loader.load()
-
-         # Split text into chunks
-         chunks = self.text_splitter.split_documents(pages)
-
-         # Format chunks with metadata
-         processed_chunks = []
-         for chunk in chunks:
-             processed_chunks.append({
-                 'text': chunk.page_content,
-                 'metadata': {
-                     'page': chunk.metadata.get('page', 0) + 1,
-                     'source': pdf_path
-                 }
-             })
-
-         return processed_chunks
 
rag_engine.py DELETED
@@ -1,112 +0,0 @@
- import os
- from typing import List, Dict
- from dotenv import load_dotenv
- import chromadb
- from langchain.embeddings import AzureOpenAIEmbeddings
- from langchain.vectorstores import Chroma
- from langchain.chat_models import AzureChatOpenAI
- from langchain.chains import RetrievalQA
- import time
-
- # Load environment variables
- load_dotenv()
-
- class RAGEngine:
-     def __init__(self):
-         # Verify Azure OpenAI settings are set
-         required_vars = [
-             'AZURE_OPENAI_ENDPOINT',
-             'AZURE_OPENAI_KEY',
-             'AZURE_OPENAI_DEPLOYMENT_NAME',
-             'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
-         ]
-
-         missing_vars = [var for var in required_vars if not os.getenv(var)]
-         if missing_vars:
-             raise ValueError(f"Missing required Azure OpenAI settings: {', '.join(missing_vars)}")
-
-         # Initialize with retry mechanism
-         max_retries = 3
-         for attempt in range(max_retries):
-             try:
-                 self.embeddings = AzureOpenAIEmbeddings(
-                     azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
-                     azure_deployment=os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'),
-                     api_key=os.getenv('AZURE_OPENAI_KEY')
-                 )
-                 self.vector_store = None
-                 self.qa_chain = None
-                 # Test connection
-                 self.embeddings.embed_query("test")
-                 break
-             except Exception as e:
-                 if attempt == max_retries - 1:
-                     raise ConnectionError(f"Failed to connect to Azure OpenAI API after {max_retries} attempts. Error: {str(e)}")
-                 time.sleep(2)  # Wait before retrying
-
-     def initialize_vector_store(self, chunks: List[Dict]):
-         """
-         Initialize the vector store with document chunks.
-
-         Args:
-             chunks (List[Dict]): List of dictionaries containing text and metadata
-         """
-         texts = [chunk['text'] for chunk in chunks]
-         metadatas = [chunk['metadata'] for chunk in chunks]
-
-         # Create vector store
-         self.vector_store = Chroma.from_texts(
-             texts=texts,
-             embedding=self.embeddings,
-             metadatas=metadatas
-         )
-
-         # Initialize QA chain
-         llm = AzureChatOpenAI(
-             temperature=0,
-             model_name="gpt-3.5-turbo",
-             azure_deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
-             azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
-             api_key=os.getenv('AZURE_OPENAI_KEY')
-         )
-         self.qa_chain = RetrievalQA.from_chain_type(
-             llm=llm,
-             chain_type="stuff",
-             retriever=self.vector_store.as_retriever(
-                 search_kwargs={"k": 3}
-             )
-         )
-
-     def answer_question(self, question: str) -> Dict:
-         """
-         Answer a question using the RAG system.
-
-         Args:
-             question (str): User's question
-
-         Returns:
-             Dict: Answer and source information
-         """
-         if not self.qa_chain:
-             raise ValueError("Vector store not initialized. Please process documents first.")
-
-         # Create a prompt that emphasizes definition extraction
-         prompt = f"""
-         Question: {question}
-         Please provide a clear and concise answer based on the provided context.
-         If the question asks for a definition or explanation of a concept,
-         make sure to provide that specifically. Include relevant examples or
-         additional context only if they help clarify the concept.
-         """
-
-         # Get answer from QA chain
-         result = self.qa_chain({"query": prompt})
-
-         # Get source documents
-         source_docs = self.vector_store.similarity_search(question, k=2)
-         sources = [
-             {
-                 'page': doc.metadata['page'],
-                 'text': doc.page_content[:200] + "..."  # Preview of source text
-             }
-             for doc in source_docs
-         ]
-
-         return {
-             'answer': result['result'],
-             'sources': sources
-         }
 
railway.toml DELETED
@@ -1,8 +0,0 @@
- [build]
- builder = "NIXPACKS"
- buildCommand = "apt-get update && apt-get install -y build-essential && pip install --upgrade pip && pip install -r requirements.txt"
-
- [deploy]
- startCommand = "streamlit run streamlit_app.py --server.address=0.0.0.0 --server.port=$PORT"
- restartPolicyType = "ON_FAILURE"
- restartPolicyMaxRetries = 10
 
requirements-dev.txt DELETED
@@ -1,20 +0,0 @@
- -r requirements.txt
-
- # Testing
- pytest==7.4.3
- pytest-cov==4.1.0
-
- # Linting
- flake8==6.1.0
- black==23.11.0
-
- # Type checking
- mypy==1.7.1
-
- # Documentation
- sphinx==7.2.6
- sphinx-rtd-theme==1.3.0
-
- # Development tools
- pre-commit==3.5.0
- python-dotenv==1.0.0
 
requirements.txt DELETED
@@ -1,13 +0,0 @@
- streamlit==1.29.0
- openai==1.6.1
- python-dotenv==1.0.0
- PyPDF2==3.0.1
- langchain==0.0.352
- chromadb==0.3.26
- pydantic==1.10.13
- azure-storage-blob==12.19.0
- numpy==1.22.4
- duckdb==0.9.2
- typing-inspect==0.8.0
- overrides==7.3.1
- SQLAlchemy==2.0.19
 
requirements_api.txt DELETED
@@ -1,4 +0,0 @@
- openai==1.6.1
- python-dotenv==1.0.0
- PyPDF2==3.0.1
- langchain==0.0.352
 
src/__init__.py DELETED
@@ -1 +0,0 @@
-
 
src/pdf_processor.py DELETED
@@ -1,112 +0,0 @@
- """
- PDF processing module for extracting and chunking text from PDF documents.
- """
- import logging
- from typing import List, Tuple
- import PyPDF2
- from io import BytesIO
-
- from app.config import MAX_CHUNK_SIZE, OVERLAP_SIZE
-
- logger = logging.getLogger('pdf')
-
- class PDFProcessor:
-     """Handles PDF document processing and text chunking."""
-
-     @staticmethod
-     def extract_text(pdf_file: BytesIO) -> str:
-         """Extract text content from a PDF file."""
-         try:
-             pdf_reader = PyPDF2.PdfReader(pdf_file)
-             text = ""
-
-             for page in pdf_reader.pages:
-                 text += page.extract_text() + "\n"
-
-             logger.info(f"Successfully extracted text from PDF ({len(text)} characters)")
-             return text
-
-         except Exception as e:
-             logger.error(f"Error extracting text from PDF: {str(e)}")
-             raise
-
-     @staticmethod
-     def create_chunks(text: str, chunk_size: int = MAX_CHUNK_SIZE,
-                       overlap: int = OVERLAP_SIZE) -> List[Tuple[str, dict]]:
-         """Split text into overlapping chunks with metadata."""
-         try:
-             chunks = []
-             start = 0
-
-             while start < len(text):
-                 # Find the end of the chunk
-                 end = start + chunk_size
-
-                 # If we're not at the end of the text, try to find a good break point
-                 if end < len(text):
-                     # Try to find the last period or newline in the chunk
-                     last_period = text.rfind('.', start, end)
-                     last_newline = text.rfind('\n', start, end)
-                     break_point = max(last_period, last_newline)
-
-                     if break_point > start:
-                         end = break_point + 1
-
-                 # Create chunk with metadata
-                 chunk_text = text[start:end].strip()
-                 if chunk_text:  # Only add non-empty chunks
-                     metadata = {
-                         "start_char": start,
-                         "end_char": end,
-                         "chunk_size": len(chunk_text)
-                     }
-                     chunks.append((chunk_text, metadata))
-
-                 # Move the start position, accounting for overlap
-                 start = end - overlap if end < len(text) else len(text)
-
-             logger.info(f"Created {len(chunks)} chunks from text")
-             return chunks
-
-         except Exception as e:
-             logger.error(f"Error creating chunks: {str(e)}")
-             raise
-
-     @staticmethod
-     def clean_text(text: str) -> str:
-         """Clean and normalize extracted text."""
-         try:
-             # Remove extra whitespace
-             text = ' '.join(text.split())
-
-             # Remove special characters that might cause issues
-             text = text.replace('\x00', '')
-
-             # Normalize newlines
-             text = text.replace('\r\n', '\n')
-
-             logger.info("Text cleaned successfully")
-             return text
-
-         except Exception as e:
-             logger.error(f"Error cleaning text: {str(e)}")
-             raise
-
-     def process_pdf(self, pdf_file: BytesIO) -> List[Tuple[str, dict]]:
-         """Process PDF file and return chunks with metadata."""
-         try:
-             # Extract text from PDF
-             raw_text = self.extract_text(pdf_file)
-
-             # Clean the extracted text
-             cleaned_text = self.clean_text(raw_text)
-
-             # Create chunks
-             chunks = self.create_chunks(cleaned_text)
-
-             logger.info(f"PDF processed successfully: {len(chunks)} chunks created")
-             return chunks
-
-         except Exception as e:
-             logger.error(f"Error processing PDF: {str(e)}")
-             raise
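A short usage sketch for the processor above: `process_pdf` returns `(text, metadata)` tuples, with chunk boundaries snapped to the last sentence or line break and consecutive chunks overlapping by `OVERLAP_SIZE` characters. The sample PDF is the one shipped in this commit's `Dataset/` folder:

```python
from io import BytesIO
from src.pdf_processor import PDFProcessor

processor = PDFProcessor()
with open("Dataset/Commercial Lending 101.pdf", "rb") as f:
    chunks = processor.process_pdf(BytesIO(f.read()))

# Inspect the first few chunks and their character offsets
for text, meta in chunks[:3]:
    print(meta["start_char"], meta["end_char"], text[:60])
```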
 
src/rag_engine.py DELETED
@@ -1,131 +0,0 @@
- """
- RAG (Retrieval Augmented Generation) engine for the CRE Chatbot.
- """
- import logging
- import os
- from typing import List, Dict, Any, Optional
-
- import chromadb
- from chromadb.config import Settings
- from openai import AzureOpenAI
- from app.config import (
-     AZURE_OPENAI_ENDPOINT,
-     AZURE_OPENAI_API_KEY,  # Added this line
-     TEMPERATURE,
-     MAX_TOKENS,
-     AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME
- )
-
- logger = logging.getLogger('rag')
-
- class RAGEngine:
-     """Handles document retrieval and question answering using Azure OpenAI."""
-
-     def __init__(self, deployment_name: str):
-         """Initialize the RAG engine with Azure OpenAI client."""
-         self.client = AzureOpenAI(
-             api_key=AZURE_OPENAI_API_KEY,
-             api_version="2023-12-01-preview",
-             azure_endpoint=AZURE_OPENAI_ENDPOINT
-         )
-         self.deployment_name = deployment_name
-         self.embedding_deployment_name = AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME
-
-         # Initialize ChromaDB with simple in-memory settings
-         self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
-         self.collection = None
-         self.initialize_vector_store("cre_docs")
-         logger.info("RAG Engine initialized with Azure OpenAI")
-
-     def create_embeddings(self, texts: List[str]) -> List[List[float]]:
-         """Create embeddings for the given texts using Azure OpenAI."""
-         try:
-             response = self.client.embeddings.create(
-                 input=texts,
-                 model=self.embedding_deployment_name
-             )
-             return [item.embedding for item in response.data]
-         except Exception as e:
-             logger.error(f"Error creating embeddings: {str(e)}")
-             raise
-
-     def initialize_vector_store(self, collection_name: str):
-         """Initialize or get the vector store collection."""
-         try:
-             self.collection = self.chroma_client.get_or_create_collection(
-                 name=collection_name,
-                 metadata={"hnsw:space": "cosine"}
-             )
-             logger.info(f"Vector store initialized with collection: {collection_name}")
-         except Exception as e:
-             logger.error(f"Error initializing vector store: {str(e)}")
-             raise
-
-     def add_documents(self, texts: List[str], metadata: Optional[List[Dict[str, Any]]] = None):
-         """Add documents to the vector store."""
-         try:
-             if not self.collection:
-                 raise ValueError("Vector store collection not initialized")
-
-             embeddings = self.create_embeddings(texts)
-             # Use timestamp + index as ID to ensure uniqueness
-             import time
-             timestamp = int(time.time())
-             ids = [f"{timestamp}_{i}" for i in range(len(texts))]
-
-             self.collection.add(
-                 embeddings=embeddings,
-                 documents=texts,
-                 ids=ids,
-                 metadatas=metadata if metadata else [{}] * len(texts)
-             )
-             logger.info(f"Added {len(texts)} documents to vector store")
-         except Exception as e:
-             logger.error(f"Error adding documents: {str(e)}")
-             raise
-
-     def query(self, question: str, k: int = 3) -> Dict[str, Any]:
-         """Query the vector store and generate an answer."""
-         try:
-             # Create embedding for the question
-             question_embedding = self.create_embeddings([question])[0]
-
-             # Query vector store
-             results = self.collection.query(
-                 query_embeddings=[question_embedding],
-                 n_results=k
-             )
-
-             # Prepare context from retrieved documents
-             context = "\n".join(results['documents'][0])
-
-             # Generate answer using Azure OpenAI
-             messages = [
-                 {"role": "system", "content": "You are a helpful assistant that answers questions about commercial real estate concepts. Use the provided context to answer questions accurately and concisely."},
-                 {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
-             ]
-
-             response = self.client.chat.completions.create(
-                 model=self.deployment_name,
-                 messages=messages,
-                 temperature=TEMPERATURE,
-                 max_tokens=MAX_TOKENS
-             )
-
-             answer = response.choices[0].message.content
-
-             return {
-                 "answer": answer,
-                 "context": context,
-                 "source_documents": results['documents'][0]
-             }
-
-         except Exception as e:
-             logger.error(f"Error querying RAG engine: {str(e)}")
-             raise
-
-     def clear(self):
-         """Clear the vector store collection."""
-         if self.collection:
-             self.collection.delete()
-             logger.info("Vector store collection cleared")
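Taken together with `src/pdf_processor.py`, the end-to-end flow is: extract and clean text, chunk it, embed the chunks into ChromaDB, then answer questions by retrieving the top-k chunks and prompting the chat deployment. A minimal wiring sketch, assuming the four Azure variables from `app/config.py` are set in the environment:

```python
import os
from io import BytesIO

from src.pdf_processor import PDFProcessor
from src.rag_engine import RAGEngine

engine = RAGEngine(os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"])
with open("Dataset/Commercial Lending 101.pdf", "rb") as f:
    chunks = PDFProcessor().process_pdf(BytesIO(f.read()))

# Each chunk is a (text, metadata) tuple
engine.add_documents([c[0] for c in chunks], [c[1] for c in chunks])
print(engine.query("What is LTV?")["answer"])
```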
 
startup.sh DELETED
@@ -1,2 +0,0 @@
- #!/bin/sh
- streamlit run app/main.py --server.port 8000 --server.address 0.0.0.0
 
staticwebapp.config.json DELETED
@@ -1,31 +0,0 @@
- {
-     "routes": [
-         {
-             "route": "/api/*",
-             "serve": "/api",
-             "methods": ["GET", "POST"]
-         },
-         {
-             "route": "/*",
-             "serve": "/",
-             "statusCode": 200
-         }
-     ],
-     "navigationFallback": {
-         "rewrite": "/index.html"
-     },
-     "platform": {
-         "apiRuntime": "python:3.11"
-     },
-     "globalHeaders": {
-         "Content-Security-Policy": "default-src * 'unsafe-inline' 'unsafe-eval' data: blob:;",
-         "Access-Control-Allow-Origin": "*"
-     },
-     "buildProperties": {
-         "appLocation": "/frontend",
-         "apiLocation": "/api",
-         "outputLocation": "",
-         "apiBuildCommand": "pip install -r requirements.txt",
-         "appBuildCommand": "pip install -r requirements.txt"
-     }
- }
 
streamlit_app.py DELETED
@@ -1,43 +0,0 @@
- import streamlit as st
- import os
- from dotenv import load_dotenv
- from pdf_processor import PDFProcessor
- from rag_engine import RAGEngine
- from app.config import AZURE_OPENAI_DEPLOYMENT_NAME
-
- # Load environment variables
- load_dotenv()
-
- # Initialize components
- pdf_processor = PDFProcessor()
- rag_engine = RAGEngine(deployment_name=AZURE_OPENAI_DEPLOYMENT_NAME)
-
- def main():
-     st.set_page_config(
-         page_title="CRE Knowledge Assistant",
-         page_icon="🤖",
-         layout="wide"
-     )
-
-     st.title("CRE Knowledge Assistant 🏢")
-
-     # File uploader
-     uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
-
-     if uploaded_file:
-         try:
-             # Process the PDF
-             pdf_processor.process(uploaded_file)
-             st.success("PDF processed successfully! You can now ask questions about it.")
-
-             # Show chat interface
-             user_question = st.text_input("Ask a question about the document:")
-             if user_question:
-                 response = rag_engine.get_response(user_question)
-                 st.write("Answer:", response)
-
-         except Exception as e:
-             st.error(f"Error processing PDF: {str(e)}")
-
- if __name__ == "__main__":
-     main()
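Worth flagging: `pdf_processor.process(...)` and `rag_engine.get_response(...)` don't exist on either implementation in this commit (the root and `src/` processors expose `process_pdf`, and `src.RAGEngine` exposes `query`), so this entry point would fail at runtime. A hedged sketch of the equivalent calls, assuming the `src/` classes were the intended targets:

```python
from io import BytesIO

# (text, metadata) tuples from the src/ processor
chunks = pdf_processor.process_pdf(BytesIO(uploaded_file.getvalue()))
rag_engine.add_documents([c[0] for c in chunks], [c[1] for c in chunks])
response = rag_engine.query(user_question)["answer"]
```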
 
tests/__init__.py DELETED
@@ -1 +0,0 @@
-
 
tests/test_pdf_processor.py DELETED
@@ -1,73 +0,0 @@
- """
- Tests for the PDF processor module.
- """
- import pytest
- from io import BytesIO
- from src.pdf_processor import PDFProcessor
-
- def test_clean_text():
-     """Test text cleaning functionality."""
-     processor = PDFProcessor()
-
-     # Test removing extra whitespace
-     text = "This  has   extra    spaces"
-     assert processor.clean_text(text) == "This has extra spaces"
-
-     # Test normalizing newlines
-     text = "Line1\r\nLine2\r\nLine3"
-     assert processor.clean_text(text) == "Line1 Line2 Line3"
-
-     # Test removing null characters
-     text = "Text with\x00null\x00chars"
-     assert processor.clean_text(text) == "Text with null chars"
-
- def test_create_chunks():
-     """Test text chunking functionality."""
-     processor = PDFProcessor()
-
-     # Test basic chunking
-     text = "This is a test. This is another test. And a final test."
-     chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
-
-     assert len(chunks) > 0
-     assert all(isinstance(chunk, tuple) for chunk in chunks)
-     assert all(len(chunk) == 2 for chunk in chunks)  # (text, metadata)
-     assert all(isinstance(chunk[1], dict) for chunk in chunks)  # metadata is dict
-
- def test_chunk_metadata():
-     """Test chunk metadata creation."""
-     processor = PDFProcessor()
-
-     text = "Short test text."
-     chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
-
-     assert len(chunks) == 1
-     chunk_text, metadata = chunks[0]
-
-     assert "start_char" in metadata
-     assert "end_char" in metadata
-     assert "chunk_size" in metadata
-     assert metadata["chunk_size"] == len(chunk_text)
-
- def test_empty_text():
-     """Test handling of empty text."""
-     processor = PDFProcessor()
-
-     chunks = processor.create_chunks("")
-     assert len(chunks) == 0
-
- def test_chunk_overlap():
-     """Test chunk overlap functionality."""
-     processor = PDFProcessor()
-
-     text = "This is a long text that should be split into multiple chunks with overlap."
-     chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
-
-     # Check that chunks overlap
-     if len(chunks) > 1:
-         for i in range(len(chunks) - 1):
-             current_chunk = chunks[i][0]
-             next_chunk = chunks[i + 1][0]
-
-             # There should be some overlap between consecutive chunks
-             assert any(word in next_chunk for word in current_chunk.split()[-3:])
 
tests/test_rag_engine.py DELETED
@@ -1,112 +0,0 @@
- """
- Tests for the RAG engine module.
- """
- import pytest
- from unittest.mock import Mock, patch
- from src.rag_engine import RAGEngine
-
- @pytest.fixture
- def mock_azure_client():
-     """Create a mock Azure OpenAI client."""
-     with patch('openai.AzureOpenAI') as mock_client:
-         yield mock_client
-
- @pytest.fixture
- def mock_chroma_client():
-     """Create a mock Chroma client."""
-     with patch('chromadb.Client') as mock_client:
-         yield mock_client
-
- @pytest.fixture
- def rag_engine(mock_azure_client, mock_chroma_client):
-     """Create a RAG engine instance with mocked dependencies."""
-     return RAGEngine("test-deployment")
-
- def test_create_embeddings(rag_engine, mock_azure_client):
-     """Test embedding creation."""
-     # Setup mock response
-     mock_response = Mock()
-     mock_response.data = [
-         Mock(embedding=[0.1, 0.2, 0.3]),
-         Mock(embedding=[0.4, 0.5, 0.6])
-     ]
-     rag_engine.client.embeddings.create.return_value = mock_response
-
-     # Test
-     texts = ["Text 1", "Text 2"]
-     embeddings = rag_engine.create_embeddings(texts)
-
-     # Verify
-     assert len(embeddings) == 2
-     assert all(isinstance(emb, list) for emb in embeddings)
-     assert len(embeddings[0]) == 3  # Embedding dimension
-
- def test_initialize_vector_store(rag_engine):
-     """Test vector store initialization."""
-     rag_engine.initialize_vector_store("test_collection")
-
-     # Verify the collection was created
-     assert rag_engine.collection is not None
-
- def test_add_documents(rag_engine):
-     """Test adding documents to vector store."""
-     # Setup
-     rag_engine.initialize_vector_store("test_collection")
-     texts = ["Document 1", "Document 2"]
-     metadata = [{"source": "test1"}, {"source": "test2"}]
-
-     # Create mock embeddings
-     with patch.object(rag_engine, 'create_embeddings') as mock_create_embeddings:
-         mock_create_embeddings.return_value = [[0.1, 0.2], [0.3, 0.4]]
-
-         # Test
-         rag_engine.add_documents(texts, metadata)
-
-         # Verify
-         mock_create_embeddings.assert_called_once_with(texts)
-         assert rag_engine.collection.add.called
-
- def test_query(rag_engine):
-     """Test querying the RAG engine."""
-     # Setup
-     rag_engine.initialize_vector_store("test_collection")
-
-     # Mock embeddings creation
-     with patch.object(rag_engine, 'create_embeddings') as mock_create_embeddings:
-         mock_create_embeddings.return_value = [[0.1, 0.2]]
-
-         # Mock vector store query
-         mock_results = {
-             'documents': [["Relevant document 1", "Relevant document 2"]],
-             'distances': [[0.1, 0.2]]
-         }
-         rag_engine.collection.query.return_value = mock_results
-
-         # Mock chat completion
-         mock_response = Mock()
-         mock_response.choices = [Mock(message=Mock(content="Test answer"))]
-         rag_engine.client.chat.completions.create.return_value = mock_response
-
-         # Test
-         result = rag_engine.query("Test question")
-
-         # Verify
-         assert isinstance(result, dict)
-         assert "answer" in result
-         assert "context" in result
-         assert "source_documents" in result
-         assert result["answer"] == "Test answer"
-
- def test_error_handling(rag_engine):
-     """Test error handling in RAG engine."""
-     # Test error in embeddings creation
-     rag_engine.client.embeddings.create.side_effect = Exception("API Error")
-
-     with pytest.raises(Exception):
-         rag_engine.create_embeddings(["Test"])
-
-     # Test error in vector store initialization
-     rag_engine.chroma_client.get_or_create_collection.side_effect = Exception("DB Error")
-
-     with pytest.raises(Exception):
-         rag_engine.initialize_vector_store("test")
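With `pytest` and `pytest-cov` from `requirements-dev.txt` installed, the suite runs with:

```bash
pytest tests/ --cov=src -v
```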
 
vercel.json DELETED
@@ -1,14 +0,0 @@
- {
-     "builds": [
-         {
-             "src": "app.py",
-             "use": "@vercel/python"
-         }
-     ],
-     "routes": [
-         {
-             "src": "/(.*)",
-             "dest": "app.py"
-         }
-     ]
- }
 
vercel.txt DELETED
@@ -1,7 +0,0 @@
- streamlit==1.29.0
- openai==1.6.1
- python-dotenv==1.0.0
- PyPDF2==3.0.1
- langchain==0.0.352
- chromadb==0.3.26
- pydantic==1.10.13