Tesneem's picture
Create app.py
b9ae50a verified
raw
history blame
2.61 kB
import os
import tempfile
from datetime import datetime
from pathlib import Path

import streamlit as st
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import PyMongoError

from document_chunker import DocumentChunker
load_dotenv()
# MongoDB connection settings, read from the environment.
# If MONGO_URI is unset, mongo_uri is None and pymongo falls back to its
# default host (localhost:27017).
mongo_uri = os.getenv("MONGO_URI")
db_name = os.getenv("MONGO_DB", "grant_docs")


# show_spinner=False keeps this call from rendering anything before
# st.set_page_config(), which Streamlit expects to be the first UI command.
@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri):
    """Return a MongoClient for *uri*, created once and then reused.

    Streamlit re-executes the whole script on every widget interaction.
    Without caching, each rerun would build a fresh MongoClient (and its
    connection pool) and abandon the previous one.
    """
    return MongoClient(uri)


client = _get_mongo_client(mongo_uri)
# Page chrome: must run before any other Streamlit element is rendered.
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

# Sidebar: target collection, document type, and an explicit connectivity check.
with st.sidebar:
    st.header("Settings")
    selected_collection = st.text_input("MongoDB Collection Name", "doc_chunks_cat")
    is_grant_app = st.toggle("Is this a Grant Application?", value=True)

    if st.button("Connect to Collection"):
        if not selected_collection.strip():
            # An empty name would make client[db_name][""] raise InvalidName.
            st.error("Please enter a collection name.")
        else:
            try:
                # MongoClient connects lazily, so indexing a collection never
                # touches the server. Ping to confirm it is really reachable
                # before claiming to be connected.
                client.admin.command("ping")
            except PyMongoError as exc:
                st.error(f"Could not reach MongoDB: {exc}")
            else:
                collection = client[db_name][selected_collection]
                st.success(f"Connected to `{selected_collection}` in `{db_name}`")
# Upload flow: receive a file, skip it if a document with the same name is
# already stored, otherwise chunk it, insert the chunks, and preview a few.
uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")

    if not selected_collection.strip():
        # An empty name would make client[db_name][""] raise InvalidName.
        st.error("Please enter a collection name in the sidebar.")
        st.stop()

    uploaded_at = datetime.now().isoformat()
    collection = client[db_name][selected_collection]

    # Duplicate detection is by file name only (stored as metadata.title).
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        st.write("⏳ Processing...")
        chunker = DocumentChunker()

        # Write the upload into a private temporary directory so that:
        #  - the original file name (and extension) is preserved for the chunker,
        #  - concurrent sessions uploading the same name cannot clobber each other,
        #  - the file is removed as soon as processing finishes.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Keep only the base name in case the client sent path components.
            temp_path = Path(tmp_dir) / Path(uploaded_file.name).name
            temp_path.write_bytes(uploaded_file.getbuffer())
            # NOTE(review): the original code branched on is_grant_app but
            # called process_document() in both branches, so the toggle has
            # no effect here. Confirm whether a different code path was meant.
            chunks = chunker.process_document(str(temp_path))

        if chunks:
            for chunk in chunks:
                chunk.setdefault("metadata", {}).update({
                    "title": uploaded_file.name,
                    "uploaded_at": uploaded_at,
                })
            # One round trip instead of one insert per chunk.
            collection.insert_many(chunks)
            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show preview of the first few chunks.
            for i, c in enumerate(chunks[:3], start=1):
                meta = c["metadata"]
                text = c.get("text", "")
                # Only add an ellipsis when the text was actually truncated.
                preview = text if len(text) <= 400 else text[:400] + "..."
                topics = ", ".join(map(str, meta.get("topics") or []))

                st.subheader(f"Chunk {i}: {meta.get('header') or 'No Header'}")
                st.markdown(preview)
                st.caption(f"Topics: {topics} | Category: {meta.get('category', 'N/A')}")

                score = meta.get("confidence_score")
                if score is not None:
                    # presumably a float in [0.0, 1.0], which st.progress
                    # accepts — TODO confirm against DocumentChunker
                    st.progress(score)

            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")
        else:
            st.warning("⚠️ No chunks were generated.")