"""Streamlit app: chunk an uploaded document and store the chunks in MongoDB.

Flow of one script run:

1. Build the MongoDB connection from environment variables (Hugging Face
   secrets) and reuse one cached client across reruns.
2. Let the user pick (or name) the target collection and flag whether the
   document is a grant application.
3. When a file is uploaded, skip it if a document with the same title is
   already stored; otherwise run it through ``DocumentChunker``, stamp every
   chunk with upload metadata, insert the chunks and preview the first few.

Each upload is handled once per session: the (name, size, collection) triple
of the last handled upload is remembered in ``st.session_state`` so that
ordinary Streamlit reruns do not process the same file again.
"""

import os
import tempfile
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

import streamlit as st
from pymongo import MongoClient
from pymongo.errors import PyMongoError

from document_chunker import DocumentChunker

DEFAULT_COLLECTION = "doc_chunks_cat"
NEW_COLLECTION_OPTION = "Create New Collection"
REQUIRED_ENV_VARS = ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")
PREVIEW_CHUNKS = 3
PREVIEW_CHARS = 400
# Session-state key remembering which upload has already been handled.
PROCESSED_KEY = "processed_upload"


@st.cache_resource(show_spinner=False)
def _get_client(uri: str) -> MongoClient:
    """Return a MongoClient for *uri*, created once and shared across reruns.

    Streamlit re-executes the whole script on every interaction; without the
    cache each rerun would build a new client and connection pool.
    """
    # NOTE(security): tlsAllowInvalidCertificates=True disables certificate
    # validation. It is kept to preserve the existing deployment behavior,
    # but should be removed once the cluster presents a verifiable cert.
    return MongoClient(
        uri,
        tls=True,
        tlsAllowInvalidCertificates=True,
        serverSelectionTimeoutMS=20000,
    )


def _render_preview(chunks) -> None:
    """Show the first few chunks plus a count of the remaining ones.

    Assumes every chunk carries ``text`` and a ``metadata`` dict with
    ``topics``, ``category`` and ``confidence_score`` -- TODO confirm against
    DocumentChunker's output schema.
    """
    for i, c in enumerate(chunks[:PREVIEW_CHUNKS]):
        metadata = c["metadata"]
        st.subheader(f"Chunk {i+1}: {metadata.get('header') or 'No Header'}")
        st.markdown(c["text"][:PREVIEW_CHARS] + "...")
        st.caption(
            f"Topics: {', '.join(metadata['topics'])} | Category: {metadata['category']}"
        )
        st.progress(metadata["confidence_score"])
    if len(chunks) > PREVIEW_CHUNKS:
        st.info(f"... and {len(chunks)-PREVIEW_CHUNKS} more chunks processed.")


def _process_upload(uploaded, collection, grant_app: bool) -> None:
    """Chunk *uploaded* and insert the chunks into *collection*.

    Args:
        uploaded: The object returned by ``st.file_uploader``.
        collection: Target pymongo collection.
        grant_app: Value stored in each chunk's ``metadata.is_grant_app``.

    Nothing is inserted when a document whose ``metadata.title`` equals the
    uploaded file name already exists, or when the chunker yields no chunks.
    """
    file_name = uploaded.name
    st.success(f"Uploaded `{file_name}`")

    if collection.find_one({"metadata.title": file_name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
        return

    st.write("⏳ Processing with DocumentChunker...")
    # A private temp directory keeps the original file name (and extension)
    # for the chunker, avoids clashes between sessions uploading files with
    # the same name, and is removed even if chunking raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Path(...).name drops any directory components from the client name.
        temp_path = Path(tmp_dir) / Path(file_name).name
        temp_path.write_bytes(uploaded.getbuffer())
        chunks = DocumentChunker().process_document(str(temp_path))

    if not chunks:
        st.warning("⚠️ No chunks were generated.")
        return

    # One timestamp per upload so all chunks of a document share it.
    uploaded_at = datetime.now().isoformat()
    for chunk in chunks:
        chunk["metadata"].update({
            "title": file_name,
            "uploaded_at": uploaded_at,
            "is_grant_app": grant_app,
        })
    # Single round trip instead of one insert per chunk.
    collection.insert_many(chunks)

    st.success(f"✅ {len(chunks)} chunks inserted into `{collection.name}`")
    _render_preview(chunks)


# set_page_config has to run before any other Streamlit element, including
# the error shown below when the configuration is incomplete.
st.set_page_config(page_title="Doc Chunker", layout="wide")

# === MongoDB connection via Hugging Face secrets ===
missing_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_vars:
    # quote_plus(None) would otherwise fail with an opaque TypeError.
    st.error(f"Missing required environment variable(s): {', '.join(missing_vars)}")
    st.stop()

user = quote_plus(os.environ["MONGO_USER"])
password = quote_plus(os.environ["MONGO_PASS"])
cluster = os.environ["MONGO_CLUSTER"]
db_name = os.environ.get("MONGO_DB", "grant_docs")
mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"

client = _get_client(mongo_uri)
db = client[db_name]

# === Streamlit UI ===
st.title("📄 Document Chunker & Uploader")

with st.sidebar:
    st.header("Settings")
    try:
        existing_collections = db.list_collection_names()
    except PyMongoError as e:
        # Best effort: fall back to the default collection name.
        st.error(f"Failed to list collections: {e}")
        selected_collection = DEFAULT_COLLECTION
    else:
        existing_collections.append(NEW_COLLECTION_OPTION)
        default_index = (
            existing_collections.index(DEFAULT_COLLECTION)
            if DEFAULT_COLLECTION in existing_collections
            else 0
        )
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            existing_collections,
            index=default_index,
        )

if selected_collection == NEW_COLLECTION_OPTION:
    selected_collection = st.sidebar.text_input("Enter Collection Name:").strip()
    if not selected_collection:
        st.warning("⚠️ Enter a collection name to proceed.")
        st.stop()

is_grant_app = st.toggle("Is this a Grant Application?", value=False)
uploaded_file = st.file_uploader(
    "Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"]
)

# === Process document ===
if uploaded_file is not None:
    # The uploader keeps returning the same file on every rerun, so remember
    # what was handled instead of resetting state and forcing a rerun (which
    # re-triggered processing in an endless loop and hid the results).
    upload_key = (uploaded_file.name, uploaded_file.size, selected_collection)
    if st.session_state.get(PROCESSED_KEY) == upload_key:
        st.info(
            f"`{uploaded_file.name}` was already handled for "
            f"`{selected_collection}` in this session. Upload another file to continue."
        )
    else:
        _process_upload(uploaded_file, db[selected_collection], is_grant_app)
        # Marked only after success so a failed attempt is retried on rerun.
        st.session_state[PROCESSED_KEY] = upload_key