Spaces:
Sleeping
Sleeping
Commit
·
4ad48e3
1
Parent(s):
0169adb
Add core configuration files
Browse files- .deployment +0 -3
- .devcontainer/devcontainer.json +0 -33
- .dockerignore +0 -9
- .gitattributes +0 -1
- .github/workflows/huggingface-spaces-sync.yml +0 -30
- .gitignore +0 -35
- .streamlit/config.toml +0 -9
- Dataset/Commercial Lending 101.pdf +0 -3
- Dockerfile +0 -16
- README.md +0 -122
- api/__init__.py +0 -8
- api/function_app.py +0 -71
- api/requirements.txt +0 -7
- app.py +0 -116
- app/__init__.py +0 -1
- app/config.py +0 -45
- app/logging.py +0 -59
- app/main.py +0 -209
- docker-compose.yml +0 -15
- frontend/main.py +0 -59
- frontend/requirements.txt +0 -3
- index.html +0 -30
- package-lock.json +0 -0
- package.json +0 -21
- pdf_processor.py +0 -42
- rag_engine.py +0 -112
- railway.toml +0 -8
- requirements-dev.txt +0 -20
- requirements.txt +0 -13
- requirements_api.txt +0 -4
- src/__init__.py +0 -1
- src/pdf_processor.py +0 -112
- src/rag_engine.py +0 -131
- startup.sh +0 -2
- staticwebapp.config.json +0 -31
- streamlit_app.py +0 -43
- tests/__init__.py +0 -1
- tests/test_pdf_processor.py +0 -73
- tests/test_rag_engine.py +0 -112
- vercel.json +0 -14
- vercel.txt +0 -7
.deployment
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
[config]
|
2 |
-
SCM_DO_BUILD_DURING_DEPLOYMENT=true
|
3 |
-
PYTHON_ENABLE_GUNICORN=false
|
|
|
|
|
|
|
|
.devcontainer/devcontainer.json
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"name": "Python 3",
|
3 |
-
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
4 |
-
"image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
|
5 |
-
"customizations": {
|
6 |
-
"codespaces": {
|
7 |
-
"openFiles": [
|
8 |
-
"README.md",
|
9 |
-
"streamlit_app.py"
|
10 |
-
]
|
11 |
-
},
|
12 |
-
"vscode": {
|
13 |
-
"settings": {},
|
14 |
-
"extensions": [
|
15 |
-
"ms-python.python",
|
16 |
-
"ms-python.vscode-pylance"
|
17 |
-
]
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
|
21 |
-
"postAttachCommand": {
|
22 |
-
"server": "streamlit run streamlit_app.py --server.enableCORS false --server.enableXsrfProtection false"
|
23 |
-
},
|
24 |
-
"portsAttributes": {
|
25 |
-
"8501": {
|
26 |
-
"label": "Application",
|
27 |
-
"onAutoForward": "openPreview"
|
28 |
-
}
|
29 |
-
},
|
30 |
-
"forwardPorts": [
|
31 |
-
8501
|
32 |
-
]
|
33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.dockerignore
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
.git
|
2 |
-
.gitignore
|
3 |
-
.env
|
4 |
-
__pycache__
|
5 |
-
*.pyc
|
6 |
-
vector_store/
|
7 |
-
venv/
|
8 |
-
.pytest_cache/
|
9 |
-
logs/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitattributes
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
*.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
.github/workflows/huggingface-spaces-sync.yml
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
name: Sync to Hugging Face Spaces
|
2 |
-
on:
|
3 |
-
push:
|
4 |
-
branches: [main]
|
5 |
-
|
6 |
-
jobs:
|
7 |
-
sync:
|
8 |
-
runs-on: ubuntu-latest
|
9 |
-
steps:
|
10 |
-
- uses: actions/checkout@v3
|
11 |
-
with:
|
12 |
-
fetch-depth: 0
|
13 |
-
lfs: true
|
14 |
-
|
15 |
-
- name: Setup Git LFS
|
16 |
-
run: |
|
17 |
-
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
|
18 |
-
sudo apt-get install git-lfs
|
19 |
-
git lfs install
|
20 |
-
|
21 |
-
- name: Push to Hugging Face Spaces
|
22 |
-
env:
|
23 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
24 |
-
run: |
|
25 |
-
git config --global user.email "[email protected]"
|
26 |
-
git config --global user.name "GitHub Actions"
|
27 |
-
git remote add hf https://tony-42069:[email protected]/spaces/tony-42069/cre-chatbot-rag
|
28 |
-
git fetch hf
|
29 |
-
git lfs push --all hf main
|
30 |
-
git push -f hf main
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
# Next.js
|
2 |
-
.next/
|
3 |
-
node_modules/
|
4 |
-
out/
|
5 |
-
|
6 |
-
# Virtual environment
|
7 |
-
venv/
|
8 |
-
env/
|
9 |
-
ENV/
|
10 |
-
|
11 |
-
# Python
|
12 |
-
__pycache__/
|
13 |
-
*.py[cod]
|
14 |
-
*$py.class
|
15 |
-
|
16 |
-
# Distribution / packaging
|
17 |
-
dist/
|
18 |
-
build/
|
19 |
-
*.egg-info/
|
20 |
-
|
21 |
-
# Local development settings
|
22 |
-
.env
|
23 |
-
.env.local
|
24 |
-
|
25 |
-
# IDE
|
26 |
-
.vscode/
|
27 |
-
.idea/
|
28 |
-
|
29 |
-
# Operating System
|
30 |
-
.DS_Store
|
31 |
-
Thumbs.db
|
32 |
-
|
33 |
-
# Misc
|
34 |
-
*.pem
|
35 |
-
.vercel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.streamlit/config.toml
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
[theme]
|
2 |
-
primaryColor = "#FF4B4B"
|
3 |
-
backgroundColor = "#FFFFFF"
|
4 |
-
secondaryBackgroundColor = "#F0F2F6"
|
5 |
-
textColor = "#262730"
|
6 |
-
font = "sans serif"
|
7 |
-
|
8 |
-
[server]
|
9 |
-
maxUploadSize = 200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dataset/Commercial Lending 101.pdf
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:46f8deca30acd6c8b45ca371babf4bbcd1848916f09c088a33e2edcf46164746
|
3 |
-
size 6879185
|
|
|
|
|
|
|
|
Dockerfile
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
FROM python:3.11-slim
|
2 |
-
|
3 |
-
WORKDIR /app
|
4 |
-
|
5 |
-
COPY requirements.txt .
|
6 |
-
RUN pip install -r requirements.txt
|
7 |
-
|
8 |
-
COPY . .
|
9 |
-
|
10 |
-
# Make port configurable via environment variable
|
11 |
-
ENV PORT=8501
|
12 |
-
|
13 |
-
EXPOSE ${PORT}
|
14 |
-
|
15 |
-
# Use the correct path to app.py and make port configurable
|
16 |
-
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
@@ -1,122 +0,0 @@
|
|
1 |
-
# Commercial Real Estate Knowledge Assistant
|
2 |
-
|
3 |
-
![Commercial Lending 101](Dataset/commercial-lending-101.png)
|
4 |
-
|
5 |
-
A sophisticated Retrieval-Augmented Generation (RAG) chatbot that transforms how professionals understand commercial real estate concepts. Built with Azure OpenAI and modern Python technologies, this assistant processes commercial real estate documentation and provides accurate, context-aware answers to your questions.
|
6 |
-
|
7 |
-
## 🚀 Deployments
|
8 |
-
- **Live Demo**: [Try it on Hugging Face Spaces](https://huggingface.co/spaces/tony-42069/cre-chatbot-rag)
|
9 |
-
|
10 |
-
## 🌟 Key Features
|
11 |
-
- **Multi-Document Support**: Process and analyze multiple PDF documents simultaneously
|
12 |
-
- **Intelligent PDF Processing**: Advanced document analysis and text extraction
|
13 |
-
- **Azure OpenAI Integration**: Leveraging GPT-3.5 Turbo for accurate, contextual responses
|
14 |
-
- **Semantic Search**: Using Azure OpenAI embeddings for precise context retrieval
|
15 |
-
- **Vector Storage**: Efficient document indexing with ChromaDB
|
16 |
-
- **Modern UI**: Beautiful chat interface with message history and source tracking
|
17 |
-
- **Enterprise-Ready**: Comprehensive logging and error handling
|
18 |
-
|
19 |
-
## 🎯 Use Cases
|
20 |
-
- **Training & Education**: Help new CRE professionals understand industry concepts
|
21 |
-
- **Quick Reference**: Instant access to definitions and explanations
|
22 |
-
- **Document Analysis**: Extract insights from CRE documentation
|
23 |
-
- **Knowledge Base**: Build and query your own CRE knowledge repository
|
24 |
-
|
25 |
-
## 🚀 Quick Start
|
26 |
-
|
27 |
-
### Prerequisites
|
28 |
-
- Python 3.8+
|
29 |
-
- Azure OpenAI Service access with:
|
30 |
-
- `gpt-35-turbo` model deployment
|
31 |
-
- `text-embedding-ada-002` model deployment
|
32 |
-
|
33 |
-
### Installation
|
34 |
-
1. Clone the repository:
|
35 |
-
```bash
|
36 |
-
git clone https://github.com/tony-42069/cre-chatbot-rag.git
|
37 |
-
cd cre-chatbot-rag
|
38 |
-
```
|
39 |
-
|
40 |
-
2. Create and activate virtual environment:
|
41 |
-
```bash
|
42 |
-
python -m venv venv
|
43 |
-
venv\Scripts\activate
|
44 |
-
```
|
45 |
-
|
46 |
-
3. Install dependencies:
|
47 |
-
```bash
|
48 |
-
pip install -r requirements.txt
|
49 |
-
```
|
50 |
-
|
51 |
-
4. Create `.env` file with Azure OpenAI credentials:
|
52 |
-
```env
|
53 |
-
AZURE_OPENAI_ENDPOINT=your_endpoint_here
|
54 |
-
AZURE_OPENAI_KEY=your_key_here
|
55 |
-
AZURE_OPENAI_DEPLOYMENT_NAME=your_gpt_deployment_name
|
56 |
-
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=text-embedding-ada-002
|
57 |
-
```
|
58 |
-
|
59 |
-
5. Run the application:
|
60 |
-
```bash
|
61 |
-
streamlit run app/main.py
|
62 |
-
```
|
63 |
-
|
64 |
-
## 🔌 Embedding
|
65 |
-
To embed this chatbot in your website, use the following HTML code:
|
66 |
-
|
67 |
-
```html
|
68 |
-
<iframe
|
69 |
-
src="https://tony-42069-cre-chatbot-rag.hf.space"
|
70 |
-
frameborder="0"
|
71 |
-
width="850px"
|
72 |
-
height="450px"
|
73 |
-
></iframe>
|
74 |
-
```
|
75 |
-
|
76 |
-
## 💡 Features
|
77 |
-
|
78 |
-
### Modern Chat Interface
|
79 |
-
- Clean, professional design
|
80 |
-
- Persistent chat history
|
81 |
-
- Source context tracking
|
82 |
-
- Multiple document management
|
83 |
-
- Real-time processing feedback
|
84 |
-
|
85 |
-
### Advanced RAG Implementation
|
86 |
-
- Semantic chunking of documents
|
87 |
-
- Azure OpenAI embeddings for accurate retrieval
|
88 |
-
- Context-aware answer generation
|
89 |
-
- Multi-document knowledge base
|
90 |
-
- Source attribution for answers
|
91 |
-
|
92 |
-
### Enterprise Security
|
93 |
-
- Secure credential management
|
94 |
-
- Azure OpenAI integration
|
95 |
-
- Local vector storage with ChromaDB
|
96 |
-
- Comprehensive error handling
|
97 |
-
- Detailed logging system
|
98 |
-
|
99 |
-
## 🛠️ Technical Stack
|
100 |
-
- **Frontend**: Streamlit
|
101 |
-
- **Language Models**: Azure OpenAI (GPT-3.5 Turbo)
|
102 |
-
- **Embeddings**: Azure OpenAI (text-embedding-ada-002)
|
103 |
-
- **Vector Store**: ChromaDB
|
104 |
-
- **PDF Processing**: PyPDF2
|
105 |
-
- **Framework**: LangChain
|
106 |
-
|
107 |
-
## 📚 Documentation
|
108 |
-
- [Azure OpenAI Service](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service/)
|
109 |
-
- [Streamlit](https://streamlit.io/)
|
110 |
-
- [LangChain](https://python.langchain.com/)
|
111 |
-
- [ChromaDB](https://www.trychroma.com/)
|
112 |
-
|
113 |
-
## 🤝 Contributing
|
114 |
-
Contributions are welcome! Please feel free to submit a Pull Request.
|
115 |
-
|
116 |
-
## 📄 License
|
117 |
-
This project is licensed under the MIT License - see the LICENSE file for details.
|
118 |
-
|
119 |
-
## 🙏 Acknowledgments
|
120 |
-
- Azure OpenAI team for providing the powerful language models
|
121 |
-
- LangChain community for the excellent RAG framework
|
122 |
-
- Streamlit team for the amazing web framework
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api/__init__.py
DELETED
@@ -1,8 +0,0 @@
|
|
1 |
-
import azure.functions as func
|
2 |
-
import streamlit as st
|
3 |
-
|
4 |
-
def main(req: func.HttpRequest) -> func.HttpResponse:
|
5 |
-
return func.HttpResponse(
|
6 |
-
"This is the API endpoint for the CRE Knowledge Assistant",
|
7 |
-
status_code=200
|
8 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api/function_app.py
DELETED
@@ -1,71 +0,0 @@
|
|
1 |
-
import azure.functions as func
|
2 |
-
import logging
|
3 |
-
import json
|
4 |
-
from io import BytesIO
|
5 |
-
|
6 |
-
# Add the project root to Python path
|
7 |
-
import sys
|
8 |
-
import os
|
9 |
-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
10 |
-
|
11 |
-
from app.config import validate_config
|
12 |
-
from app.logging import setup_logging
|
13 |
-
from src.pdf_processor import PDFProcessor
|
14 |
-
from src.rag_engine import RAGEngine
|
15 |
-
|
16 |
-
# Initialize components
|
17 |
-
setup_logging()
|
18 |
-
logger = logging.getLogger('app')
|
19 |
-
pdf_processor = PDFProcessor()
|
20 |
-
rag_engine = RAGEngine()
|
21 |
-
|
22 |
-
def process_pdf(req: func.HttpRequest) -> func.HttpResponse:
|
23 |
-
try:
|
24 |
-
# Get the PDF file from the request
|
25 |
-
pdf_file = req.files['file']
|
26 |
-
pdf_bytes = pdf_file.read()
|
27 |
-
|
28 |
-
# Process the PDF
|
29 |
-
pdf_processor.process(BytesIO(pdf_bytes))
|
30 |
-
|
31 |
-
return func.HttpResponse(
|
32 |
-
json.dumps({"message": "PDF processed successfully"}),
|
33 |
-
mimetype="application/json",
|
34 |
-
status_code=200
|
35 |
-
)
|
36 |
-
except Exception as e:
|
37 |
-
logger.error(f"Error processing PDF: {str(e)}")
|
38 |
-
return func.HttpResponse(
|
39 |
-
json.dumps({"error": str(e)}),
|
40 |
-
mimetype="application/json",
|
41 |
-
status_code=500
|
42 |
-
)
|
43 |
-
|
44 |
-
def query(req: func.HttpRequest) -> func.HttpResponse:
|
45 |
-
try:
|
46 |
-
# Get the query from request body
|
47 |
-
req_body = req.get_json()
|
48 |
-
user_query = req_body.get('query')
|
49 |
-
|
50 |
-
if not user_query:
|
51 |
-
return func.HttpResponse(
|
52 |
-
json.dumps({"error": "No query provided"}),
|
53 |
-
mimetype="application/json",
|
54 |
-
status_code=400
|
55 |
-
)
|
56 |
-
|
57 |
-
# Process query through RAG engine
|
58 |
-
answer = rag_engine.process_query(user_query)
|
59 |
-
|
60 |
-
return func.HttpResponse(
|
61 |
-
json.dumps({"answer": answer}),
|
62 |
-
mimetype="application/json",
|
63 |
-
status_code=200
|
64 |
-
)
|
65 |
-
except Exception as e:
|
66 |
-
logger.error(f"Error processing query: {str(e)}")
|
67 |
-
return func.HttpResponse(
|
68 |
-
json.dumps({"error": str(e)}),
|
69 |
-
mimetype="application/json",
|
70 |
-
status_code=500
|
71 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api/requirements.txt
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
azure-functions==1.15.0
|
2 |
-
openai==1.6.1
|
3 |
-
python-dotenv==1.0.0
|
4 |
-
azure-cognitiveservices-language-textanalytics==0.2.0
|
5 |
-
PyPDF2==3.0.1
|
6 |
-
langchain==0.0.352
|
7 |
-
azure-storage-blob==12.19.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
DELETED
@@ -1,116 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import tempfile
|
3 |
-
import os
|
4 |
-
from pdf_processor import PDFProcessor
|
5 |
-
from rag_engine import RAGEngine
|
6 |
-
|
7 |
-
# Initialize session state
|
8 |
-
if 'rag_engine' not in st.session_state:
|
9 |
-
try:
|
10 |
-
st.session_state.rag_engine = RAGEngine()
|
11 |
-
except ValueError as e:
|
12 |
-
st.error(f"Configuration Error: {str(e)}")
|
13 |
-
st.stop()
|
14 |
-
except ConnectionError as e:
|
15 |
-
st.error(f"Connection Error: {str(e)}")
|
16 |
-
st.stop()
|
17 |
-
except Exception as e:
|
18 |
-
st.error(f"Unexpected Error: {str(e)}")
|
19 |
-
st.stop()
|
20 |
-
|
21 |
-
if 'processed_file' not in st.session_state:
|
22 |
-
st.session_state.processed_file = False
|
23 |
-
|
24 |
-
# Page config
|
25 |
-
st.set_page_config(page_title="Concept Definition Chatbot", layout="wide")
|
26 |
-
st.title("Concept Definition Chatbot")
|
27 |
-
|
28 |
-
# Sidebar for PDF upload
|
29 |
-
with st.sidebar:
|
30 |
-
st.header("Upload PDF")
|
31 |
-
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
32 |
-
|
33 |
-
if uploaded_file is not None and not st.session_state.processed_file:
|
34 |
-
with st.spinner("Processing PDF..."):
|
35 |
-
try:
|
36 |
-
# Save uploaded file temporarily
|
37 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
38 |
-
tmp_file.write(uploaded_file.getvalue())
|
39 |
-
tmp_path = tmp_file.name
|
40 |
-
|
41 |
-
# Process PDF
|
42 |
-
processor = PDFProcessor()
|
43 |
-
chunks = processor.process_pdf(tmp_path)
|
44 |
-
|
45 |
-
# Initialize RAG engine
|
46 |
-
st.session_state.rag_engine.initialize_vector_store(chunks)
|
47 |
-
st.session_state.processed_file = True
|
48 |
-
|
49 |
-
# Clean up
|
50 |
-
os.unlink(tmp_path)
|
51 |
-
except ValueError as e:
|
52 |
-
st.error(f"Configuration Error: {str(e)}")
|
53 |
-
st.stop()
|
54 |
-
except ConnectionError as e:
|
55 |
-
st.error(f"Connection Error: {str(e)}")
|
56 |
-
st.stop()
|
57 |
-
except Exception as e:
|
58 |
-
st.error(f"Unexpected Error: {str(e)}")
|
59 |
-
st.stop()
|
60 |
-
st.success("PDF processed successfully!")
|
61 |
-
|
62 |
-
# Main chat interface
|
63 |
-
if st.session_state.processed_file:
|
64 |
-
# Initialize chat history
|
65 |
-
if "messages" not in st.session_state:
|
66 |
-
st.session_state.messages = []
|
67 |
-
|
68 |
-
# Display chat messages
|
69 |
-
for message in st.session_state.messages:
|
70 |
-
with st.chat_message(message["role"]):
|
71 |
-
st.markdown(message["content"])
|
72 |
-
if "sources" in message:
|
73 |
-
with st.expander("View Sources"):
|
74 |
-
for source in message["sources"]:
|
75 |
-
st.markdown(f"**Page {source['page']}:**\n{source['text']}")
|
76 |
-
|
77 |
-
# Chat input
|
78 |
-
if prompt := st.chat_input("Ask a question about the concepts in your PDF"):
|
79 |
-
# Add user message to chat history
|
80 |
-
st.session_state.messages.append({"role": "user", "content": prompt})
|
81 |
-
|
82 |
-
# Display user message
|
83 |
-
with st.chat_message("user"):
|
84 |
-
st.markdown(prompt)
|
85 |
-
|
86 |
-
# Get bot response
|
87 |
-
with st.chat_message("assistant"):
|
88 |
-
with st.spinner("Thinking..."):
|
89 |
-
try:
|
90 |
-
response = st.session_state.rag_engine.answer_question(prompt)
|
91 |
-
|
92 |
-
# Display response
|
93 |
-
st.markdown(response["answer"])
|
94 |
-
|
95 |
-
# Display sources in expander
|
96 |
-
with st.expander("View Sources"):
|
97 |
-
for source in response["sources"]:
|
98 |
-
st.markdown(f"**Page {source['page']}:**\n{source['text']}")
|
99 |
-
|
100 |
-
# Add assistant response to chat history
|
101 |
-
st.session_state.messages.append({
|
102 |
-
"role": "assistant",
|
103 |
-
"content": response["answer"],
|
104 |
-
"sources": response["sources"]
|
105 |
-
})
|
106 |
-
except ValueError as e:
|
107 |
-
st.error(f"Configuration Error: {str(e)}")
|
108 |
-
st.stop()
|
109 |
-
except ConnectionError as e:
|
110 |
-
st.error(f"Connection Error: {str(e)}")
|
111 |
-
st.stop()
|
112 |
-
except Exception as e:
|
113 |
-
st.error(f"Unexpected Error: {str(e)}")
|
114 |
-
st.stop()
|
115 |
-
else:
|
116 |
-
st.info("Please upload a PDF file to start chatting.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
|
|
|
|
app/config.py
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Configuration management for the CRE Chatbot application.
|
3 |
-
"""
|
4 |
-
import os
|
5 |
-
from dotenv import load_dotenv
|
6 |
-
|
7 |
-
# Load environment variables
|
8 |
-
load_dotenv()
|
9 |
-
|
10 |
-
# Azure OpenAI Configuration
|
11 |
-
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
|
12 |
-
AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_KEY')
|
13 |
-
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME')
|
14 |
-
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
|
15 |
-
|
16 |
-
# Application Configuration
|
17 |
-
MAX_CHUNK_SIZE = 1000
|
18 |
-
OVERLAP_SIZE = 200
|
19 |
-
TEMPERATURE = 0.7
|
20 |
-
MAX_TOKENS = 500
|
21 |
-
|
22 |
-
# Logging Configuration
|
23 |
-
LOG_LEVEL = "INFO"
|
24 |
-
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
25 |
-
LOG_FILE = "logs/app.log"
|
26 |
-
|
27 |
-
# Vector Store Configuration
|
28 |
-
VECTOR_STORE_PATH = "vector_store"
|
29 |
-
|
30 |
-
def validate_config():
|
31 |
-
"""Validate that all required configuration variables are set."""
|
32 |
-
required_vars = [
|
33 |
-
'AZURE_OPENAI_ENDPOINT',
|
34 |
-
'AZURE_OPENAI_API_KEY',
|
35 |
-
'AZURE_OPENAI_DEPLOYMENT_NAME',
|
36 |
-
'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
|
37 |
-
]
|
38 |
-
|
39 |
-
missing_vars = [var for var in required_vars if not os.getenv(var)]
|
40 |
-
|
41 |
-
if missing_vars:
|
42 |
-
raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
|
43 |
-
|
44 |
-
# Validate that all required configuration variables are set.
|
45 |
-
validate_config()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/logging.py
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Logging configuration for the CRE Chatbot application.
|
3 |
-
"""
|
4 |
-
import logging
|
5 |
-
import os
|
6 |
-
from logging.handlers import RotatingFileHandler
|
7 |
-
from .config import LOG_LEVEL, LOG_FORMAT, LOG_FILE
|
8 |
-
|
9 |
-
def setup_logging():
|
10 |
-
"""Set up logging configuration for the application."""
|
11 |
-
# Create logs directory if it doesn't exist
|
12 |
-
os.makedirs('logs', exist_ok=True)
|
13 |
-
|
14 |
-
# Set up root logger
|
15 |
-
logger = logging.getLogger()
|
16 |
-
logger.setLevel(LOG_LEVEL)
|
17 |
-
|
18 |
-
# Create formatters and handlers
|
19 |
-
formatter = logging.Formatter(LOG_FORMAT)
|
20 |
-
|
21 |
-
# Console Handler
|
22 |
-
console_handler = logging.StreamHandler()
|
23 |
-
console_handler.setFormatter(formatter)
|
24 |
-
logger.addHandler(console_handler)
|
25 |
-
|
26 |
-
# File Handler
|
27 |
-
file_handler = RotatingFileHandler(
|
28 |
-
LOG_FILE,
|
29 |
-
maxBytes=10485760, # 10MB
|
30 |
-
backupCount=5
|
31 |
-
)
|
32 |
-
file_handler.setFormatter(formatter)
|
33 |
-
logger.addHandler(file_handler)
|
34 |
-
|
35 |
-
# Create separate loggers for different components
|
36 |
-
loggers = {
|
37 |
-
'api': setup_component_logger('api'),
|
38 |
-
'pdf': setup_component_logger('pdf'),
|
39 |
-
'rag': setup_component_logger('rag'),
|
40 |
-
'app': setup_component_logger('app')
|
41 |
-
}
|
42 |
-
|
43 |
-
return loggers
|
44 |
-
|
45 |
-
def setup_component_logger(name):
|
46 |
-
"""Set up a logger for a specific component."""
|
47 |
-
logger = logging.getLogger(name)
|
48 |
-
logger.setLevel(LOG_LEVEL)
|
49 |
-
|
50 |
-
# Create component-specific log file
|
51 |
-
handler = RotatingFileHandler(
|
52 |
-
f'logs/{name}.log',
|
53 |
-
maxBytes=10485760, # 10MB
|
54 |
-
backupCount=3
|
55 |
-
)
|
56 |
-
handler.setFormatter(logging.Formatter(LOG_FORMAT))
|
57 |
-
logger.addHandler(handler)
|
58 |
-
|
59 |
-
return logger
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/main.py
DELETED
@@ -1,209 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Main Streamlit application for the CRE Chatbot.
|
3 |
-
"""
|
4 |
-
import logging
|
5 |
-
import streamlit as st
|
6 |
-
from io import BytesIO
|
7 |
-
import sys
|
8 |
-
import os
|
9 |
-
|
10 |
-
# Add the project root to Python path
|
11 |
-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
12 |
-
|
13 |
-
from app.config import validate_config, AZURE_OPENAI_DEPLOYMENT_NAME
|
14 |
-
from app.logging import setup_logging
|
15 |
-
from src.pdf_processor import PDFProcessor
|
16 |
-
from src.rag_engine import RAGEngine
|
17 |
-
|
18 |
-
# Setup logging
|
19 |
-
loggers = setup_logging()
|
20 |
-
logger = logging.getLogger('app')
|
21 |
-
|
22 |
-
# Page configuration
|
23 |
-
st.set_page_config(
|
24 |
-
page_title="CRE Knowledge Assistant",
|
25 |
-
page_icon="🏢",
|
26 |
-
layout="wide",
|
27 |
-
initial_sidebar_state="expanded"
|
28 |
-
)
|
29 |
-
|
30 |
-
# Custom CSS
|
31 |
-
st.markdown("""
|
32 |
-
<style>
|
33 |
-
.main {
|
34 |
-
background-color: #f5f5f5;
|
35 |
-
}
|
36 |
-
.stApp {
|
37 |
-
max-width: 1200px;
|
38 |
-
margin: 0 auto;
|
39 |
-
}
|
40 |
-
.chat-message {
|
41 |
-
padding: 1.5rem;
|
42 |
-
border-radius: 0.5rem;
|
43 |
-
margin-bottom: 1rem;
|
44 |
-
display: flex;
|
45 |
-
flex-direction: column;
|
46 |
-
}
|
47 |
-
.chat-message.user {
|
48 |
-
background-color: #e3f2fd;
|
49 |
-
}
|
50 |
-
.chat-message.assistant {
|
51 |
-
background-color: #f3e5f5;
|
52 |
-
}
|
53 |
-
.chat-message .message {
|
54 |
-
margin-top: 0.5rem;
|
55 |
-
}
|
56 |
-
</style>
|
57 |
-
""", unsafe_allow_html=True)
|
58 |
-
|
59 |
-
# Initialize session state
|
60 |
-
if 'rag_engine' not in st.session_state:
|
61 |
-
st.session_state.rag_engine = None
|
62 |
-
if 'pdf_processor' not in st.session_state:
|
63 |
-
st.session_state.pdf_processor = PDFProcessor()
|
64 |
-
if 'chat_history' not in st.session_state:
|
65 |
-
st.session_state.chat_history = []
|
66 |
-
if 'uploaded_pdfs' not in st.session_state:
|
67 |
-
st.session_state.uploaded_pdfs = set()
|
68 |
-
|
69 |
-
def initialize_rag_engine(deployment_name: str):
|
70 |
-
"""Initialize the RAG engine with error handling."""
|
71 |
-
try:
|
72 |
-
st.session_state.rag_engine = RAGEngine(deployment_name)
|
73 |
-
logger.info("RAG Engine initialized successfully")
|
74 |
-
except Exception as e:
|
75 |
-
logger.error(f"Error initializing the application: {str(e)}")
|
76 |
-
st.error(f"Error initializing the application: {str(e)}")
|
77 |
-
|
78 |
-
def process_pdf(pdf_file):
|
79 |
-
"""Process uploaded PDF file."""
|
80 |
-
try:
|
81 |
-
# Check if PDF was already processed
|
82 |
-
if pdf_file.name in st.session_state.uploaded_pdfs:
|
83 |
-
st.warning(f"'{pdf_file.name}' has already been processed!")
|
84 |
-
return
|
85 |
-
|
86 |
-
with st.spinner(f"Processing {pdf_file.name}..."):
|
87 |
-
# Read PDF content
|
88 |
-
pdf_content = pdf_file.read()
|
89 |
-
|
90 |
-
# Process PDF and get chunks
|
91 |
-
chunks = st.session_state.pdf_processor.process_pdf(
|
92 |
-
BytesIO(pdf_content)
|
93 |
-
)
|
94 |
-
|
95 |
-
# Add chunks to vector store
|
96 |
-
texts = [chunk[0] for chunk in chunks]
|
97 |
-
metadata = [{"source": pdf_file.name, **chunk[1]} for chunk in chunks]
|
98 |
-
st.session_state.rag_engine.add_documents(texts, metadata)
|
99 |
-
|
100 |
-
# Mark PDF as processed
|
101 |
-
st.session_state.uploaded_pdfs.add(pdf_file.name)
|
102 |
-
|
103 |
-
st.success(f"Successfully processed '{pdf_file.name}'!")
|
104 |
-
logger.info(f"PDF '{pdf_file.name}' processed and added to vector store")
|
105 |
-
|
106 |
-
except Exception as e:
|
107 |
-
logger.error(f"Error processing PDF: {str(e)}")
|
108 |
-
st.error(f"Error processing PDF: {str(e)}")
|
109 |
-
|
110 |
-
def display_chat_message(role: str, content: str):
|
111 |
-
"""Display a chat message with proper styling."""
|
112 |
-
with st.container():
|
113 |
-
st.markdown(f"""
|
114 |
-
<div class="chat-message {role}">
|
115 |
-
<div class="role"><strong>{'You' if role == 'user' else 'Assistant'}:</strong></div>
|
116 |
-
<div class="message">{content}</div>
|
117 |
-
</div>
|
118 |
-
""", unsafe_allow_html=True)
|
119 |
-
|
120 |
-
def main():
|
121 |
-
"""Main application function."""
|
122 |
-
# Header
|
123 |
-
col1, col2 = st.columns([2, 1])
|
124 |
-
with col1:
|
125 |
-
st.title("🏢 CRE Knowledge Assistant")
|
126 |
-
st.markdown("*Your AI guide for commercial real estate concepts*")
|
127 |
-
|
128 |
-
# Sidebar
|
129 |
-
with st.sidebar:
|
130 |
-
st.header("📚 Knowledge Base")
|
131 |
-
st.markdown("Upload your CRE documents to enhance the assistant's knowledge.")
|
132 |
-
|
133 |
-
# Model configuration (collapsible)
|
134 |
-
with st.expander("⚙️ Model Configuration"):
|
135 |
-
deployment_name = st.text_input(
|
136 |
-
"Model Deployment Name",
|
137 |
-
value=AZURE_OPENAI_DEPLOYMENT_NAME,
|
138 |
-
help="Enter your Azure OpenAI model deployment name"
|
139 |
-
)
|
140 |
-
|
141 |
-
# Initialize RAG engine if not already done
|
142 |
-
if not st.session_state.rag_engine:
|
143 |
-
initialize_rag_engine(deployment_name)
|
144 |
-
|
145 |
-
# PDF upload section
|
146 |
-
st.subheader("📄 Upload Documents")
|
147 |
-
uploaded_files = st.file_uploader(
|
148 |
-
"Choose PDF files",
|
149 |
-
type="pdf",
|
150 |
-
accept_multiple_files=True,
|
151 |
-
help="Upload one or more PDF files to add to the knowledge base"
|
152 |
-
)
|
153 |
-
|
154 |
-
if uploaded_files:
|
155 |
-
for pdf_file in uploaded_files:
|
156 |
-
process_pdf(pdf_file)
|
157 |
-
|
158 |
-
# Show processed documents
|
159 |
-
if st.session_state.uploaded_pdfs:
|
160 |
-
st.subheader("📚 Processed Documents")
|
161 |
-
for pdf_name in st.session_state.uploaded_pdfs:
|
162 |
-
st.markdown(f"✓ {pdf_name}")
|
163 |
-
|
164 |
-
# Main chat interface
|
165 |
-
if st.session_state.rag_engine:
|
166 |
-
# Display chat history
|
167 |
-
for message in st.session_state.chat_history:
|
168 |
-
display_chat_message(
|
169 |
-
role=message["role"],
|
170 |
-
content=message["content"]
|
171 |
-
)
|
172 |
-
|
173 |
-
# Chat input
|
174 |
-
user_question = st.text_input(
|
175 |
-
"Ask a question about commercial real estate:",
|
176 |
-
placeholder="e.g., What is LTV? How is DSCR calculated?",
|
177 |
-
key="user_question"
|
178 |
-
)
|
179 |
-
|
180 |
-
if user_question:
|
181 |
-
try:
|
182 |
-
# Add user message to chat
|
183 |
-
st.session_state.chat_history.append({
|
184 |
-
"role": "user",
|
185 |
-
"content": user_question
|
186 |
-
})
|
187 |
-
|
188 |
-
with st.spinner("Generating answer..."):
|
189 |
-
response = st.session_state.rag_engine.query(user_question)
|
190 |
-
|
191 |
-
# Add assistant response to chat
|
192 |
-
st.session_state.chat_history.append({
|
193 |
-
"role": "assistant",
|
194 |
-
"content": response["answer"]
|
195 |
-
})
|
196 |
-
|
197 |
-
# Display latest messages immediately
|
198 |
-
display_chat_message("user", user_question)
|
199 |
-
display_chat_message("assistant", response["answer"])
|
200 |
-
|
201 |
-
except Exception as e:
|
202 |
-
logger.error(f"Error generating answer: {str(e)}")
|
203 |
-
st.error(f"Error generating answer: {str(e)}")
|
204 |
-
|
205 |
-
else:
|
206 |
-
st.info("👆 Please upload PDF documents in the sidebar to start asking questions!")
|
207 |
-
|
208 |
-
if __name__ == "__main__":
|
209 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docker-compose.yml
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
# Local development stack for the CRE chatbot (single Streamlit service).
version: '3.8'

services:
  chatbot:
    # Build from the Dockerfile in the repository root.
    build: .
    ports:
      - "8501:8501"  # Streamlit's default port
    # Azure OpenAI credentials are passed through from the host environment;
    # none are hard-coded here.
    environment:
      - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT}
      - AZURE_OPENAI_KEY=${AZURE_OPENAI_KEY}
      - AZURE_OPENAI_DEPLOYMENT_NAME=${AZURE_OPENAI_DEPLOYMENT_NAME}
      - AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=${AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME}
    # Persist the vector index and logs across container restarts.
    volumes:
      - ./vector_store:/app/vector_store
      - ./logs:/app/logs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
frontend/main.py
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
import streamlit as st
import requests
import sys
import os

# Add the project root to the Python path so `app.*` imports resolve when
# this file is run directly from the frontend/ directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.config import validate_config
from app.logging import setup_logging

# Base URL of the backend API. BUGFIX: the original code passed the bare
# relative paths "api/process_pdf" / "api/query" to requests, which raises
# requests.exceptions.MissingSchema (requests only accepts absolute URLs),
# so neither endpoint could ever be reached. Override via API_BASE_URL.
API_BASE_URL = os.environ.get("API_BASE_URL", "http://localhost:7071/api")


def main():
    """Render the Streamlit frontend and proxy uploads/questions to the API.

    Side effects: configures logging, renders UI widgets, and performs HTTP
    POSTs to the backend for PDF processing and question answering.
    """
    # Setup logging
    setup_logging()

    st.set_page_config(
        page_title="CRE Knowledge Assistant",
        page_icon="🤖",
        layout="wide"
    )

    st.title("CRE Knowledge Assistant")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

    if uploaded_file:
        # Convert file to bytes
        file_bytes = uploaded_file.getvalue()

        # Send to API endpoint; guard against network-level failures so the
        # UI shows an error instead of a stack trace.
        try:
            response = requests.post(
                f"{API_BASE_URL}/process_pdf",
                files={"file": (uploaded_file.name, file_bytes, "application/pdf")},
                timeout=120,  # PDF processing can be slow; don't hang forever
            )
            if response.status_code == 200:
                st.success("PDF processed successfully!")
            else:
                st.error("Error processing PDF")
        except requests.RequestException as exc:
            st.error(f"Could not reach the API: {exc}")

    # Query input
    query = st.text_input("Ask a question about your documents:")

    if query:
        # Send query to API endpoint
        try:
            response = requests.post(
                f"{API_BASE_URL}/query",
                json={"query": query},
                timeout=60,
            )
            if response.status_code == 200:
                result = response.json()
                st.write("Answer:", result["answer"])
            else:
                st.error("Error processing query")
        except requests.RequestException as exc:
            st.error(f"Could not reach the API: {exc}")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
frontend/requirements.txt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
streamlit==1.29.0
|
2 |
-
requests==2.31.0
|
3 |
-
python-dotenv==1.0.0
|
|
|
|
|
|
|
|
index.html
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
<!DOCTYPE html>
<!-- Thin static shell: embeds the app (served under /api) in a
     full-viewport iframe for the static-web-app deployment. -->
<html>
<head>
    <title>CRE Knowledge Assistant</title>
    <style>
        /* Remove default page chrome so the iframe fills the viewport. */
        body, html {
            margin: 0;
            padding: 0;
            height: 100%;
            overflow: hidden;
        }
        iframe {
            width: 100%;
            height: 100vh;
            border: none;
        }
    </style>
</head>
<body>
    <iframe src="/api" allow="camera;microphone"></iframe>
    <script>
        // Listen for cross-frame messages from the embedded app.
        window.addEventListener('message', function(e) {
            // Handle any messages from the Streamlit app
            if (e.data.type === 'streamlit') {
                console.log('Received message from Streamlit:', e.data);
            }
        });
    </script>
</body>
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
package-lock.json
DELETED
The diff for this file is too large to render.
See raw diff
|
|
package.json
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"name": "cre-chatbot-rag",
|
3 |
-
"version": "1.0.0",
|
4 |
-
"private": true,
|
5 |
-
"scripts": {
|
6 |
-
"dev": "next dev",
|
7 |
-
"build": "next build",
|
8 |
-
"start": "next start"
|
9 |
-
},
|
10 |
-
"dependencies": {
|
11 |
-
"@chakra-ui/react": "^2.8.2",
|
12 |
-
"@emotion/react": "^11.11.1",
|
13 |
-
"@emotion/styled": "^11.11.0",
|
14 |
-
"axios": "^1.6.2",
|
15 |
-
"framer-motion": "^10.16.16",
|
16 |
-
"next": "^14.0.4",
|
17 |
-
"react": "^18.2.0",
|
18 |
-
"react-dom": "^18.2.0",
|
19 |
-
"vercel": "^39.1.1"
|
20 |
-
}
|
21 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pdf_processor.py
DELETED
@@ -1,42 +0,0 @@
|
|
1 |
-
from typing import List, Dict
|
2 |
-
from langchain.document_loaders import PyPDFLoader
|
3 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
-
|
5 |
-
class PDFProcessor:
    """Loads a PDF from disk and splits it into overlapping text chunks."""

    def __init__(self):
        # Recursive splitter: prefers paragraph breaks, then line breaks,
        # then spaces, falling back to raw characters for very long tokens.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Dict]: List of dictionaries with 'text' (chunk content) and
            'metadata' ('page' is 1-based, 'source' is the input path).
        """
        # Load PDF (PyPDFLoader yields one Document per page with a
        # 0-based 'page' entry in its metadata)
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        # Split text into chunks
        chunks = self.text_splitter.split_documents(pages)

        # Format chunks with metadata; a comprehension replaces the manual
        # append loop (same output, clearer intent).
        return [
            {
                'text': chunk.page_content,
                'metadata': {
                    # Convert the 0-based PyPDF page index to human 1-based.
                    'page': chunk.metadata.get('page', 0) + 1,
                    'source': pdf_path,
                },
            }
            for chunk in chunks
        ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rag_engine.py
DELETED
@@ -1,112 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from typing import List, Dict
|
3 |
-
from dotenv import load_dotenv
|
4 |
-
import chromadb
|
5 |
-
from langchain.embeddings import AzureOpenAIEmbeddings
|
6 |
-
from langchain.vectorstores import Chroma
|
7 |
-
from langchain.chat_models import AzureChatOpenAI
|
8 |
-
from langchain.chains import RetrievalQA
|
9 |
-
import time
|
10 |
-
|
11 |
-
# Load environment variables
|
12 |
-
load_dotenv()
|
13 |
-
|
14 |
-
class RAGEngine:
    """RAG pipeline: Azure OpenAI embeddings + Chroma retrieval + chat answers.

    Requires AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY,
    AZURE_OPENAI_DEPLOYMENT_NAME and AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME
    in the environment.
    """

    def __init__(self):
        """Validate configuration and connect to Azure OpenAI with retries.

        Raises:
            ValueError: if any required environment variable is missing.
            ConnectionError: if the embedding endpoint is unreachable after
                all retries.
        """
        # Verify Azure OpenAI settings are set before creating any clients.
        required_vars = [
            'AZURE_OPENAI_ENDPOINT',
            'AZURE_OPENAI_KEY',
            'AZURE_OPENAI_DEPLOYMENT_NAME',
            'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
        ]

        missing_vars = [var for var in required_vars if not os.getenv(var)]
        if missing_vars:
            raise ValueError(f"Missing required Azure OpenAI settings: {', '.join(missing_vars)}")

        # Initialize with a retry mechanism so a transient network error on
        # the connectivity probe does not kill startup.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                self.embeddings = AzureOpenAIEmbeddings(
                    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
                    azure_deployment=os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'),
                    api_key=os.getenv('AZURE_OPENAI_KEY')
                )
                self.vector_store = None
                self.qa_chain = None
                # Probe the endpoint with a tiny request to fail fast.
                self.embeddings.embed_query("test")
                break
            except Exception as e:
                if attempt == max_retries - 1:
                    raise ConnectionError(f"Failed to connect to Azure OpenAI API after {max_retries} attempts. Error: {str(e)}")
                time.sleep(2)  # Wait before retrying

    def initialize_vector_store(self, chunks: List[Dict]):
        """
        Initialize the vector store with document chunks.

        Args:
            chunks (List[Dict]): List of dictionaries containing 'text' and
                'metadata' keys (as produced by PDFProcessor.process_pdf).
        """
        texts = [chunk['text'] for chunk in chunks]
        metadatas = [chunk['metadata'] for chunk in chunks]

        # Create the in-memory Chroma index from the chunk texts.
        self.vector_store = Chroma.from_texts(
            texts=texts,
            embedding=self.embeddings,
            metadatas=metadatas
        )

        # Initialize the QA chain. BUGFIX: AzureChatOpenAI in the pinned
        # langchain 0.0.352 has no `azure_deployment_name` kwarg (the correct
        # name is `deployment_name`), so the Azure deployment was never
        # actually set on the LLM.
        llm = AzureChatOpenAI(
            temperature=0,  # deterministic answers
            model_name="gpt-3.5-turbo",
            deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
            api_key=os.getenv('AZURE_OPENAI_KEY'),
        )
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",  # concatenate retrieved chunks into the prompt
            retriever=self.vector_store.as_retriever(
                search_kwargs={"k": 3}  # top-3 chunks as context
            )
        )

    def answer_question(self, question: str) -> Dict:
        """
        Answer a question using the RAG system.

        Args:
            question (str): User's question

        Returns:
            Dict: {'answer': str, 'sources': [{'page': ..., 'text': ...}]}

        Raises:
            ValueError: if called before initialize_vector_store().
        """
        if not self.qa_chain:
            raise ValueError("Vector store not initialized. Please process documents first.")

        # Create a prompt that emphasizes definition extraction
        prompt = f"""
        Question: {question}
        Please provide a clear and concise answer based on the provided context.
        If the question asks for a definition or explanation of a concept,
        make sure to provide that specifically. Include relevant examples or
        additional context only if they help clarify the concept.
        """

        # Get answer from QA chain
        result = self.qa_chain({"query": prompt})

        # Fetch the two closest chunks again for source attribution.
        source_docs = self.vector_store.similarity_search(question, k=2)
        sources = [
            {
                'page': doc.metadata['page'],
                'text': doc.page_content[:200] + "..."  # Preview of source text
            }
            for doc in source_docs
        ]

        return {
            'answer': result['result'],
            'sources': sources
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
railway.toml
DELETED
@@ -1,8 +0,0 @@
|
|
1 |
-
# Railway deployment configuration.
[build]
builder = "NIXPACKS"
# build-essential is needed to compile native wheels pulled in by requirements.txt.
buildCommand = "apt-get update && apt-get install -y build-essential && pip install --upgrade pip && pip install -r requirements.txt"

[deploy]
# Bind to 0.0.0.0 and the platform-assigned $PORT so ingress can route traffic.
startCommand = "streamlit run streamlit_app.py --server.address=0.0.0.0 --server.port=$PORT"
restartPolicyType = "ON_FAILURE"
restartPolicyMaxRetries = 10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements-dev.txt
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
-r requirements.txt
|
2 |
-
|
3 |
-
# Testing
|
4 |
-
pytest==7.4.3
|
5 |
-
pytest-cov==4.1.0
|
6 |
-
|
7 |
-
# Linting
|
8 |
-
flake8==6.1.0
|
9 |
-
black==23.11.0
|
10 |
-
|
11 |
-
# Type checking
|
12 |
-
mypy==1.7.1
|
13 |
-
|
14 |
-
# Documentation
|
15 |
-
sphinx==7.2.6
|
16 |
-
sphinx-rtd-theme==1.3.0
|
17 |
-
|
18 |
-
# Development tools
|
19 |
-
pre-commit==3.5.0
|
20 |
-
python-dotenv==1.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
streamlit==1.29.0
|
2 |
-
openai==1.6.1
|
3 |
-
python-dotenv==1.0.0
|
4 |
-
PyPDF2==3.0.1
|
5 |
-
langchain==0.0.352
|
6 |
-
chromadb==0.3.26
|
7 |
-
pydantic==1.10.13
|
8 |
-
azure-storage-blob==12.19.0
|
9 |
-
numpy==1.22.4
|
10 |
-
duckdb==0.9.2
|
11 |
-
typing-inspect==0.8.0
|
12 |
-
overrides==7.3.1
|
13 |
-
SQLAlchemy==2.0.19
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements_api.txt
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
openai==1.6.1
|
2 |
-
python-dotenv==1.0.0
|
3 |
-
PyPDF2==3.0.1
|
4 |
-
langchain==0.0.352
|
|
|
|
|
|
|
|
|
|
src/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
|
|
|
|
src/pdf_processor.py
DELETED
@@ -1,112 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
PDF processing module for extracting and chunking text from PDF documents.
|
3 |
-
"""
|
4 |
-
import logging
|
5 |
-
from typing import List, Tuple
|
6 |
-
import PyPDF2
|
7 |
-
from io import BytesIO
|
8 |
-
|
9 |
-
from app.config import MAX_CHUNK_SIZE, OVERLAP_SIZE
|
10 |
-
|
11 |
-
logger = logging.getLogger('pdf')
|
12 |
-
|
13 |
-
class PDFProcessor:
    """Handles PDF document processing and text chunking."""

    @staticmethod
    def extract_text(pdf_file: BytesIO) -> str:
        """Extract text content from a PDF file.

        Args:
            pdf_file: In-memory PDF bytes.

        Returns:
            Concatenated text of all pages, one newline appended per page.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""

            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"

            logger.info(f"Successfully extracted text from PDF ({len(text)} characters)")
            return text

        except Exception as e:
            logger.error(f"Error extracting text from PDF: {str(e)}")
            raise

    @staticmethod
    def create_chunks(text: str, chunk_size: int = MAX_CHUNK_SIZE,
                     overlap: int = OVERLAP_SIZE) -> List[Tuple[str, dict]]:
        """Split text into overlapping chunks with metadata.

        Chunks prefer to end on the last '.' or newline inside the window,
        and consecutive chunks share up to `overlap` characters of context.

        Returns:
            List of (chunk_text, metadata) tuples; metadata holds
            start_char, end_char and chunk_size.
        """
        try:
            chunks = []
            start = 0

            while start < len(text):
                # Find the end of the chunk
                end = start + chunk_size

                # If we're not at the end of the text, try to find a good break point
                if end < len(text):
                    # Try to find the last period or newline in the chunk
                    last_period = text.rfind('.', start, end)
                    last_newline = text.rfind('\n', start, end)
                    break_point = max(last_period, last_newline)

                    if break_point > start:
                        end = break_point + 1

                # Create chunk with metadata
                chunk_text = text[start:end].strip()
                if chunk_text:  # Only add non-empty chunks
                    metadata = {
                        "start_char": start,
                        "end_char": end,
                        "chunk_size": len(chunk_text)
                    }
                    chunks.append((chunk_text, metadata))

                # Move the start position, accounting for overlap.
                # BUGFIX: when a break point shrank the chunk so much that
                # end - overlap <= start, the old `start = end - overlap`
                # re-scanned the same window forever (infinite loop); always
                # advance past `end` in that case.
                if end >= len(text):
                    start = len(text)
                else:
                    start = end - overlap if end - overlap > start else end

            logger.info(f"Created {len(chunks)} chunks from text")
            return chunks

        except Exception as e:
            logger.error(f"Error creating chunks: {str(e)}")
            raise

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean and normalize extracted text (whitespace, NULs, newlines)."""
        try:
            # Remove extra whitespace (note: this also collapses newlines to
            # single spaces)
            text = ' '.join(text.split())

            # Remove special characters that might cause issues
            text = text.replace('\x00', '')

            # Normalize newlines (a no-op after the collapse above; kept for
            # safety if the collapse is ever removed)
            text = text.replace('\r\n', '\n')

            logger.info("Text cleaned successfully")
            return text

        except Exception as e:
            logger.error(f"Error cleaning text: {str(e)}")
            raise

    def process_pdf(self, pdf_file: BytesIO) -> List[Tuple[str, dict]]:
        """Process PDF file and return chunks with metadata.

        Pipeline: extract text -> clean/normalize -> chunk.
        """
        try:
            # Extract text from PDF
            raw_text = self.extract_text(pdf_file)

            # Clean the extracted text
            cleaned_text = self.clean_text(raw_text)

            # Create chunks
            chunks = self.create_chunks(cleaned_text)

            logger.info(f"PDF processed successfully: {len(chunks)} chunks created")
            return chunks

        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/rag_engine.py
DELETED
@@ -1,131 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
RAG (Retrieval Augmented Generation) engine for the CRE Chatbot.
|
3 |
-
"""
|
4 |
-
import logging
|
5 |
-
import os
|
6 |
-
from typing import List, Dict, Any, Optional
|
7 |
-
|
8 |
-
import chromadb
|
9 |
-
from chromadb.config import Settings
|
10 |
-
from openai import AzureOpenAI
|
11 |
-
from app.config import (
|
12 |
-
AZURE_OPENAI_ENDPOINT,
|
13 |
-
AZURE_OPENAI_API_KEY, # Added this line
|
14 |
-
TEMPERATURE,
|
15 |
-
MAX_TOKENS,
|
16 |
-
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME
|
17 |
-
)
|
18 |
-
|
19 |
-
logger = logging.getLogger('rag')
|
20 |
-
|
21 |
-
class RAGEngine:
    """Handles document retrieval and question answering using Azure OpenAI."""

    def __init__(self, deployment_name: str):
        """Initialize the RAG engine with Azure OpenAI client.

        Args:
            deployment_name: Azure OpenAI chat-model deployment used for
                answer generation (embeddings use the deployment from config).
        """
        self.client = AzureOpenAI(
            api_key=AZURE_OPENAI_API_KEY,
            api_version="2023-12-01-preview",
            azure_endpoint=AZURE_OPENAI_ENDPOINT
        )
        self.deployment_name = deployment_name
        self.embedding_deployment_name = AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME

        # Initialize ChromaDB with simple in-memory settings
        # (no persistence: the index is rebuilt on every process start).
        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
        self.collection = None
        self.initialize_vector_store("cre_docs")
        logger.info("RAG Engine initialized with Azure OpenAI")

    def create_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Create embeddings for the given texts using Azure OpenAI.

        Returns one embedding vector per input text, in input order.
        """
        try:
            # Single batched API call for all texts.
            response = self.client.embeddings.create(
                input=texts,
                model=self.embedding_deployment_name
            )
            return [item.embedding for item in response.data]
        except Exception as e:
            logger.error(f"Error creating embeddings: {str(e)}")
            raise

    def initialize_vector_store(self, collection_name: str):
        """Initialize or get the vector store collection.

        Idempotent: get_or_create_collection reuses an existing collection
        with the same name.
        """
        try:
            self.collection = self.chroma_client.get_or_create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"}  # cosine distance for retrieval
            )
            logger.info(f"Vector store initialized with collection: {collection_name}")
        except Exception as e:
            logger.error(f"Error initializing vector store: {str(e)}")
            raise

    def add_documents(self, texts: List[str], metadata: Optional[List[Dict[str, Any]]] = None):
        """Add documents to the vector store.

        Args:
            texts: Document chunks to embed and index.
            metadata: Optional per-document metadata dicts, parallel to texts.

        Raises:
            ValueError: if initialize_vector_store was never called.
        """
        try:
            if not self.collection:
                raise ValueError("Vector store collection not initialized")

            embeddings = self.create_embeddings(texts)
            # Use timestamp + index as ID to ensure uniqueness
            # NOTE(review): two add_documents calls within the same second
            # produce colliding IDs — confirm this is acceptable upstream.
            import time
            timestamp = int(time.time())
            ids = [f"{timestamp}_{i}" for i in range(len(texts))]

            self.collection.add(
                embeddings=embeddings,
                documents=texts,
                ids=ids,
                metadatas=metadata if metadata else [{}] * len(texts)
            )
            logger.info(f"Added {len(texts)} documents to vector store")
        except Exception as e:
            logger.error(f"Error adding documents: {str(e)}")
            raise

    def query(self, question: str, k: int = 3) -> Dict[str, Any]:
        """Query the vector store and generate an answer.

        Args:
            question: The user's question.
            k: Number of nearest chunks to retrieve as context.

        Returns:
            Dict with 'answer', 'context' (joined chunks) and
            'source_documents' (the retrieved chunk texts).
        """
        try:
            # Create embedding for the question
            question_embedding = self.create_embeddings([question])[0]

            # Query vector store
            results = self.collection.query(
                query_embeddings=[question_embedding],
                n_results=k
            )

            # Prepare context from retrieved documents
            # (results['documents'][0] is the hit list for the single query).
            context = "\n".join(results['documents'][0])

            # Generate answer using Azure OpenAI
            messages = [
                {"role": "system", "content": "You are a helpful assistant that answers questions about commercial real estate concepts. Use the provided context to answer questions accurately and concisely."},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
            ]

            response = self.client.chat.completions.create(
                model=self.deployment_name,
                messages=messages,
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS
            )

            answer = response.choices[0].message.content

            return {
                "answer": answer,
                "context": context,
                "source_documents": results['documents'][0]
            }

        except Exception as e:
            logger.error(f"Error querying RAG engine: {str(e)}")
            raise

    def clear(self):
        """Clear the vector store collection.

        NOTE(review): chromadb's Collection.delete() called with no ids or
        filter may be a no-op or raise depending on the chromadb version; to
        drop a whole collection the usual call is
        client.delete_collection(name) — confirm against the pinned version.
        """
        if self.collection:
            self.collection.delete()
            logger.info("Vector store collection cleared")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
startup.sh
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
#!/bin/sh
# Launch the Streamlit app, binding to all interfaces on port 8000 so the
# hosting platform's ingress can reach it inside the container.
streamlit run app/main.py --server.port 8000 --server.address 0.0.0.0
|
|
|
|
|
|
staticwebapp.config.json
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"routes": [
|
3 |
-
{
|
4 |
-
"route": "/api/*",
|
5 |
-
"serve": "/api",
|
6 |
-
"methods": ["GET", "POST"]
|
7 |
-
},
|
8 |
-
{
|
9 |
-
"route": "/*",
|
10 |
-
"serve": "/",
|
11 |
-
"statusCode": 200
|
12 |
-
}
|
13 |
-
],
|
14 |
-
"navigationFallback": {
|
15 |
-
"rewrite": "/index.html"
|
16 |
-
},
|
17 |
-
"platform": {
|
18 |
-
"apiRuntime": "python:3.11"
|
19 |
-
},
|
20 |
-
"globalHeaders": {
|
21 |
-
"Content-Security-Policy": "default-src * 'unsafe-inline' 'unsafe-eval' data: blob:;",
|
22 |
-
"Access-Control-Allow-Origin": "*"
|
23 |
-
},
|
24 |
-
"buildProperties": {
|
25 |
-
"appLocation": "/frontend",
|
26 |
-
"apiLocation": "/api",
|
27 |
-
"outputLocation": "",
|
28 |
-
"apiBuildCommand": "pip install -r requirements.txt",
|
29 |
-
"appBuildCommand": "pip install -r requirements.txt"
|
30 |
-
}
|
31 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit_app.py
DELETED
@@ -1,43 +0,0 @@
|
|
1 |
-
import streamlit as st
import os
from dotenv import load_dotenv
from pdf_processor import PDFProcessor
from rag_engine import RAGEngine
from app.config import AZURE_OPENAI_DEPLOYMENT_NAME

# Load environment variables
load_dotenv()

# Initialize components once at import time (shared across Streamlit reruns).
# NOTE(review): the top-level rag_engine.RAGEngine takes no constructor
# arguments; this keyword call matches src/rag_engine.RAGEngine instead —
# confirm which module actually resolves on sys.path.
pdf_processor = PDFProcessor()
rag_engine = RAGEngine(deployment_name=AZURE_OPENAI_DEPLOYMENT_NAME)

def main():
    """Single-page Streamlit UI: upload a PDF, then ask questions about it."""
    st.set_page_config(
        page_title="CRE Knowledge Assistant",
        page_icon="🤖",
        layout="wide"
    )

    st.title("CRE Knowledge Assistant 🏢")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

    if uploaded_file:
        try:
            # Process the PDF
            # NOTE(review): pdf_processor.PDFProcessor defines process_pdf(),
            # not process() — this call looks like it raises AttributeError;
            # confirm against the module that resolves.
            pdf_processor.process(uploaded_file)
            st.success("PDF processed successfully! You can now ask questions about it.")

            # Show chat interface
            user_question = st.text_input("Ask a question about the document:")
            if user_question:
                # NOTE(review): neither RAGEngine variant defines
                # get_response() (they expose query()/answer_question()) —
                # confirm the intended method.
                response = rag_engine.get_response(user_question)
                st.write("Answer:", response)

        except Exception as e:
            st.error(f"Error processing PDF: {str(e)}")

if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
|
|
|
|
tests/test_pdf_processor.py
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Tests for the PDF processor module.
|
3 |
-
"""
|
4 |
-
import pytest
|
5 |
-
from io import BytesIO
|
6 |
-
from src.pdf_processor import PDFProcessor
|
7 |
-
|
8 |
-
def test_clean_text():
|
9 |
-
"""Test text cleaning functionality."""
|
10 |
-
processor = PDFProcessor()
|
11 |
-
|
12 |
-
# Test removing extra whitespace
|
13 |
-
text = "This has extra spaces"
|
14 |
-
assert processor.clean_text(text) == "This has extra spaces"
|
15 |
-
|
16 |
-
# Test normalizing newlines
|
17 |
-
text = "Line1\r\nLine2\r\nLine3"
|
18 |
-
assert processor.clean_text(text) == "Line1 Line2 Line3"
|
19 |
-
|
20 |
-
# Test removing null characters
|
21 |
-
text = "Text with\x00null\x00chars"
|
22 |
-
assert processor.clean_text(text) == "Text with null chars"
|
23 |
-
|
24 |
-
def test_create_chunks():
|
25 |
-
"""Test text chunking functionality."""
|
26 |
-
processor = PDFProcessor()
|
27 |
-
|
28 |
-
# Test basic chunking
|
29 |
-
text = "This is a test. This is another test. And a final test."
|
30 |
-
chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
|
31 |
-
|
32 |
-
assert len(chunks) > 0
|
33 |
-
assert all(isinstance(chunk, tuple) for chunk in chunks)
|
34 |
-
assert all(len(chunk) == 2 for chunk in chunks) # (text, metadata)
|
35 |
-
assert all(isinstance(chunk[1], dict) for chunk in chunks) # metadata is dict
|
36 |
-
|
37 |
-
def test_chunk_metadata():
|
38 |
-
"""Test chunk metadata creation."""
|
39 |
-
processor = PDFProcessor()
|
40 |
-
|
41 |
-
text = "Short test text."
|
42 |
-
chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
|
43 |
-
|
44 |
-
assert len(chunks) == 1
|
45 |
-
chunk_text, metadata = chunks[0]
|
46 |
-
|
47 |
-
assert "start_char" in metadata
|
48 |
-
assert "end_char" in metadata
|
49 |
-
assert "chunk_size" in metadata
|
50 |
-
assert metadata["chunk_size"] == len(chunk_text)
|
51 |
-
|
52 |
-
def test_empty_text():
|
53 |
-
"""Test handling of empty text."""
|
54 |
-
processor = PDFProcessor()
|
55 |
-
|
56 |
-
chunks = processor.create_chunks("")
|
57 |
-
assert len(chunks) == 0
|
58 |
-
|
59 |
-
def test_chunk_overlap():
|
60 |
-
"""Test chunk overlap functionality."""
|
61 |
-
processor = PDFProcessor()
|
62 |
-
|
63 |
-
text = "This is a long text that should be split into multiple chunks with overlap."
|
64 |
-
chunks = processor.create_chunks(text, chunk_size=20, overlap=5)
|
65 |
-
|
66 |
-
# Check that chunks overlap
|
67 |
-
if len(chunks) > 1:
|
68 |
-
for i in range(len(chunks) - 1):
|
69 |
-
current_chunk = chunks[i][0]
|
70 |
-
next_chunk = chunks[i + 1][0]
|
71 |
-
|
72 |
-
# There should be some overlap between consecutive chunks
|
73 |
-
assert any(word in next_chunk for word in current_chunk.split()[-3:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_rag_engine.py
DELETED
@@ -1,112 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Tests for the RAG engine module.
|
3 |
-
"""
|
4 |
-
import pytest
|
5 |
-
from unittest.mock import Mock, patch
|
6 |
-
from src.rag_engine import RAGEngine
|
7 |
-
|
8 |
-
@pytest.fixture
|
9 |
-
def mock_azure_client():
|
10 |
-
"""Create a mock Azure OpenAI client."""
|
11 |
-
with patch('openai.AzureOpenAI') as mock_client:
|
12 |
-
yield mock_client
|
13 |
-
|
14 |
-
@pytest.fixture
|
15 |
-
def mock_chroma_client():
|
16 |
-
"""Create a mock Chroma client."""
|
17 |
-
with patch('chromadb.Client') as mock_client:
|
18 |
-
yield mock_client
|
19 |
-
|
20 |
-
@pytest.fixture
|
21 |
-
def rag_engine(mock_azure_client, mock_chroma_client):
|
22 |
-
"""Create a RAG engine instance with mocked dependencies."""
|
23 |
-
return RAGEngine("test-deployment")
|
24 |
-
|
25 |
-
def test_create_embeddings(rag_engine, mock_azure_client):
|
26 |
-
"""Test embedding creation."""
|
27 |
-
# Setup mock response
|
28 |
-
mock_response = Mock()
|
29 |
-
mock_response.data = [
|
30 |
-
Mock(embedding=[0.1, 0.2, 0.3]),
|
31 |
-
Mock(embedding=[0.4, 0.5, 0.6])
|
32 |
-
]
|
33 |
-
rag_engine.client.embeddings.create.return_value = mock_response
|
34 |
-
|
35 |
-
# Test
|
36 |
-
texts = ["Text 1", "Text 2"]
|
37 |
-
embeddings = rag_engine.create_embeddings(texts)
|
38 |
-
|
39 |
-
# Verify
|
40 |
-
assert len(embeddings) == 2
|
41 |
-
assert all(isinstance(emb, list) for emb in embeddings)
|
42 |
-
assert len(embeddings[0]) == 3 # Embedding dimension
|
43 |
-
|
44 |
-
def test_initialize_vector_store(rag_engine):
|
45 |
-
"""Test vector store initialization."""
|
46 |
-
rag_engine.initialize_vector_store("test_collection")
|
47 |
-
|
48 |
-
# Verify the collection was created
|
49 |
-
assert rag_engine.collection is not None
|
50 |
-
|
51 |
-
def test_add_documents(rag_engine):
|
52 |
-
"""Test adding documents to vector store."""
|
53 |
-
# Setup
|
54 |
-
rag_engine.initialize_vector_store("test_collection")
|
55 |
-
texts = ["Document 1", "Document 2"]
|
56 |
-
metadata = [{"source": "test1"}, {"source": "test2"}]
|
57 |
-
|
58 |
-
# Create mock embeddings
|
59 |
-
with patch.object(rag_engine, 'create_embeddings') as mock_create_embeddings:
|
60 |
-
mock_create_embeddings.return_value = [[0.1, 0.2], [0.3, 0.4]]
|
61 |
-
|
62 |
-
# Test
|
63 |
-
rag_engine.add_documents(texts, metadata)
|
64 |
-
|
65 |
-
# Verify
|
66 |
-
mock_create_embeddings.assert_called_once_with(texts)
|
67 |
-
assert rag_engine.collection.add.called
|
68 |
-
|
69 |
-
def test_query(rag_engine):
|
70 |
-
"""Test querying the RAG engine."""
|
71 |
-
# Setup
|
72 |
-
rag_engine.initialize_vector_store("test_collection")
|
73 |
-
|
74 |
-
# Mock embeddings creation
|
75 |
-
with patch.object(rag_engine, 'create_embeddings') as mock_create_embeddings:
|
76 |
-
mock_create_embeddings.return_value = [[0.1, 0.2]]
|
77 |
-
|
78 |
-
# Mock vector store query
|
79 |
-
mock_results = {
|
80 |
-
'documents': [["Relevant document 1", "Relevant document 2"]],
|
81 |
-
'distances': [[0.1, 0.2]]
|
82 |
-
}
|
83 |
-
rag_engine.collection.query.return_value = mock_results
|
84 |
-
|
85 |
-
# Mock chat completion
|
86 |
-
mock_response = Mock()
|
87 |
-
mock_response.choices = [Mock(message=Mock(content="Test answer"))]
|
88 |
-
rag_engine.client.chat.completions.create.return_value = mock_response
|
89 |
-
|
90 |
-
# Test
|
91 |
-
result = rag_engine.query("Test question")
|
92 |
-
|
93 |
-
# Verify
|
94 |
-
assert isinstance(result, dict)
|
95 |
-
assert "answer" in result
|
96 |
-
assert "context" in result
|
97 |
-
assert "source_documents" in result
|
98 |
-
assert result["answer"] == "Test answer"
|
99 |
-
|
100 |
-
def test_error_handling(rag_engine):
|
101 |
-
"""Test error handling in RAG engine."""
|
102 |
-
# Test error in embeddings creation
|
103 |
-
rag_engine.client.embeddings.create.side_effect = Exception("API Error")
|
104 |
-
|
105 |
-
with pytest.raises(Exception):
|
106 |
-
rag_engine.create_embeddings(["Test"])
|
107 |
-
|
108 |
-
# Test error in vector store initialization
|
109 |
-
rag_engine.chroma_client.get_or_create_collection.side_effect = Exception("DB Error")
|
110 |
-
|
111 |
-
with pytest.raises(Exception):
|
112 |
-
rag_engine.initialize_vector_store("test")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vercel.json
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"builds": [
|
3 |
-
{
|
4 |
-
"src": "app.py",
|
5 |
-
"use": "@vercel/python"
|
6 |
-
}
|
7 |
-
],
|
8 |
-
"routes": [
|
9 |
-
{
|
10 |
-
"src": "/(.*)",
|
11 |
-
"dest": "app.py"
|
12 |
-
}
|
13 |
-
]
|
14 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vercel.txt
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
streamlit==1.29.0
|
2 |
-
openai==1.6.1
|
3 |
-
python-dotenv==1.0.0
|
4 |
-
PyPDF2==3.0.1
|
5 |
-
langchain==0.0.352
|
6 |
-
chromadb==0.3.26
|
7 |
-
pydantic==1.10.13
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|