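# Spaces: Running
# (The line above appears to be page-status residue captured with the script; it is not part of the program.)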
import os
import tempfile
from datetime import datetime
from pathlib import Path

import streamlit as st
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import PyMongoError

from document_chunker import DocumentChunker
# Load MONGO_URI / MONGO_DB (and anything else) from a local .env file, if present.
load_dotenv()

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(page_title="Doc Chunker", layout="wide")
# NOTE(review): the leading character of the title looks like a mis-decoded
# emoji; it is reproduced unchanged here - confirm the intended glyph.
st.title("π Document Chunker & Uploader")

# MongoDB connection settings.
# NOTE(review): when MONGO_URI is unset, `mongo_uri` is None and MongoClient
# falls back to its default host - confirm that this fallback is intended.
mongo_uri = os.getenv("MONGO_URI")
db_name = os.getenv("MONGO_DB", "grant_docs")


@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri):
    """Return a MongoClient for *uri*, created once and reused across reruns.

    Streamlit re-executes the whole script on every widget interaction.
    Without caching, each rerun would build a brand-new client (and its
    connection pool) that is never closed.
    """
    return MongoClient(uri)


client = _get_mongo_client(mongo_uri)
# Sidebar: target collection, document-type toggle, and a connectivity check.
with st.sidebar:
    st.header("Settings")
    selected_collection = st.text_input("MongoDB Collection Name", "doc_chunks_cat")
    is_grant_app = st.toggle("Is this a Grant Application?", value=True)

    if st.button("Connect to Collection"):
        try:
            # MongoClient and collection handles are lazy: creating them does
            # not contact the server. Ping first so that the success message
            # below reflects a real round trip.
            client.admin.command("ping")
            collection = client[db_name][selected_collection]
        except PyMongoError as exc:
            # Covers unreachable servers as well as invalid collection names.
            st.error(f"Could not connect to `{selected_collection}` in `{db_name}`: {exc}")
        else:
            st.success(f"Connected to `{selected_collection}` in `{db_name}`")
# Main area: upload a document, chunk it, store the chunks, and preview a few.
# NOTE(review): several UI strings below start with what look like mis-decoded
# emoji; they are reproduced unchanged - confirm the intended glyphs.
uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")

    # Timestamp stamped on every chunk of this upload (naive local time, ISO 8601).
    modified_time = datetime.now().isoformat()
    collection = client[db_name][selected_collection]

    # Check if file already exists in collection (matched on the uploaded
    # file name) before doing any disk or chunking work.
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("β οΈ This file already exists in the collection. Skipping...")
    else:
        st.write("β³ Processing...")
        chunker = DocumentChunker()

        # Write the upload into a private temporary directory so that
        # concurrent sessions uploading a same-named file cannot overwrite
        # each other, and so the copy is removed as soon as chunking finishes.
        # The original base name is kept in case the chunker inspects the
        # file name or extension.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir) / Path(uploaded_file.name).name
            with open(temp_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # NOTE(review): the original chose between two identical
            # `process_document` calls based on `is_grant_app`, so the sidebar
            # toggle currently has no effect on processing.
            chunks = chunker.process_document(str(temp_path))

        if chunks:
            for chunk in chunks:
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                })
            # One bulk write instead of one round trip per chunk.
            collection.insert_many(chunks)
            st.success(f"β {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show preview of the first three chunks.
            for number, c in enumerate(chunks[:3], start=1):
                st.subheader(f"Chunk {number}: {c['metadata'].get('header') or 'No Header'}")
                st.markdown(c['text'][:400] + "...")
                st.caption(f"Topics: {', '.join(c['metadata']['topics'])} | Category: {c['metadata']['category']}")
                # presumably a value in the range st.progress accepts - TODO confirm
                st.progress(c['metadata']['confidence_score'])

            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")
        else:
            st.warning("β οΈ No chunks were generated.")