"""Streamlit app: chunk an uploaded DOCX/TXT file and store the chunks in MongoDB.

Flow: the user picks a collection in the sidebar, uploads a file, the file is
run through ``DocumentChunker.process_document`` and every resulting chunk is
inserted into the selected collection, tagged with the file name and an upload
timestamp. Files whose name is already present (``metadata.title``) are skipped.
"""

import os
import tempfile
from datetime import datetime
from pathlib import Path

import streamlit as st
from dotenv import load_dotenv
from pymongo import MongoClient

from document_chunker import DocumentChunker

load_dotenv()

# MongoDB connection settings are read from the environment (.env loaded above).
mongo_uri = os.getenv("MONGO_URI")
db_name = os.getenv("MONGO_DB", "grant_docs")

# Issue the page config before any other Streamlit call, including the cached
# client lookup below.
st.set_page_config(page_title="Doc Chunker", layout="wide")


@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri):
    """Return one MongoClient per URI, reused across Streamlit reruns.

    Streamlit re-executes this script on every widget interaction; without the
    cache each rerun would construct a fresh client and connection pool.
    """
    return MongoClient(uri)


client = _get_mongo_client(mongo_uri)

st.title("📄 Document Chunker & Uploader")

with st.sidebar:
    st.header("Settings")
    selected_collection = st.text_input("MongoDB Collection Name", "doc_chunks_cat")
    is_grant_app = st.toggle("Is this a Grant Application?", value=True)

    if st.button("Connect to Collection"):
        if selected_collection:
            collection = client[db_name][selected_collection]
            st.success(f"Connected to `{selected_collection}` in `{db_name}`")
        else:
            # PyMongo rejects empty collection names; report it instead of crashing.
            st.error("Please enter a MongoDB collection name.")

uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")

    if not selected_collection:
        st.error("Please enter a MongoDB collection name.")
        st.stop()

    modified_time = datetime.now().isoformat()
    collection = client[db_name][selected_collection]

    # Check for a duplicate *before* touching the disk or running the chunker.
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        st.write("⏳ Processing...")
        chunker = DocumentChunker()

        # Keep only the basename so a crafted upload name cannot point outside
        # the temp directory; the extension is preserved for the chunker.
        safe_name = Path(uploaded_file.name).name or "upload"

        # A private temporary directory avoids collisions between concurrent
        # uploads with the same name and is removed once processing finishes.
        with tempfile.TemporaryDirectory() as tmp_dir:
            temp_path = Path(tmp_dir) / safe_name
            temp_path.write_bytes(uploaded_file.getbuffer())

            # NOTE(review): the original code called process_document() in both
            # branches of an `is_grant_app` conditional, so the toggle currently
            # has no effect on processing. Wire in the alternative code path
            # here once it exists.
            chunks = chunker.process_document(str(temp_path))

        if chunks:
            for chunk in chunks:
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                })
            # One bulk write instead of one round-trip per chunk.
            collection.insert_many(chunks)

            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show a preview of the first few chunks.
            for i, c in enumerate(chunks[:3]):
                meta = c['metadata']
                text = c['text']
                st.subheader(f"Chunk {i+1}: {meta.get('header') or 'No Header'}")
                # Only add an ellipsis when the text was actually truncated.
                st.markdown(text[:400] + ("..." if len(text) > 400 else ""))
                st.caption(f"Topics: {', '.join(meta['topics'])} | Category: {meta['category']}")
                st.progress(meta['confidence_score'])

            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")
        else:
            st.warning("⚠️ No chunks were generated.")