Spaces:

Tesneem
/

document_chunker

Running

File size: 2,750 Bytes

b9ae50a
23a7785
b9ae50a
 
 
 
 
 
23a7785
 
 
b9ae50a
23a7785
b9ae50a
23a7785
b9ae50a
 
 
 
 
23a7785
 
 
 
 
 
 
 
 
b9ae50a
 
 
 
 
 
 
 
 
 
 
 
23a7785
 
b9ae50a
 
 
23a7785
b9ae50a
23a7785
b9ae50a
 
 
 
 
 
23a7785
b9ae50a
 
 
 
 
23a7785
b9ae50a

import os
import streamlit as st
import tempfile
from pymongo import MongoClient
from datetime import datetime
from pathlib import Path
from document_chunker import DocumentChunker

# === MongoDB connection via Hugging Face secrets ===
# Streamlit re-executes this script from top to bottom on every user
# interaction. Caching the client as a resource keeps one MongoClient (and its
# connection pool) per process instead of opening a new one on each rerun.
@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri: str) -> MongoClient:
    """Return a process-wide MongoClient for *uri*, created on first use.

    ``show_spinner=False`` ensures no Streamlit element is rendered here, so
    this can safely run before ``st.set_page_config``.
    """
    return MongoClient(uri)


# MONGO_URI is required (KeyError if the secret is missing); MONGO_DB is
# optional and defaults to "grant_docs".
mongo_uri = os.environ["MONGO_URI"]
db_name = os.environ.get("MONGO_DB", "grant_docs")
client = _get_mongo_client(mongo_uri)
db = client[db_name]

# === Streamlit UI ===
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

# Collection used when the database cannot be queried or has no collections.
_DEFAULT_COLLECTION = "doc_chunks_cat"

with st.sidebar:
    st.header("Settings")

    # Fetch collection names for the dropdown. Only the database call sits in
    # the try body so that unrelated UI errors are not reported as a
    # "failed to list collections" problem.
    try:
        existing_collections = db.list_collection_names()
    except Exception as e:
        st.error(f"Failed to list collections: {e}")
        selected_collection = _DEFAULT_COLLECTION
    else:
        # Pre-select the default collection when it exists, else the first one.
        default_index = (
            existing_collections.index(_DEFAULT_COLLECTION)
            if _DEFAULT_COLLECTION in existing_collections
            else 0
        )
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            existing_collections,
            index=default_index,
        )
        # With no options (empty database) the selectbox yields None, which
        # would later fail as a collection name; fall back to the default.
        if not selected_collection:
            selected_collection = _DEFAULT_COLLECTION

    is_grant_app = st.toggle("Is this a Grant Application?", value=True)

uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

# Handle an uploaded document: skip it if a document with the same title is
# already stored, otherwise chunk it, tag every chunk with upload metadata,
# insert the chunks into the selected collection and preview the first few.
if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")

    modified_time = datetime.now().isoformat()
    collection = db[selected_collection]

    # Check for duplicates first so nothing is written to disk for a file
    # that is going to be skipped anyway.
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        st.write("⏳ Processing with DocumentChunker...")

        # A private temporary directory avoids name collisions between
        # concurrent sessions and guarantees the copy is removed afterwards.
        # Only the base name of the upload is used, so the file cannot be
        # written outside that directory; the original file name (and
        # extension) is kept for the chunker.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir) / Path(uploaded_file.name).name
            temp_path.write_bytes(uploaded_file.getbuffer())
            chunker = DocumentChunker()
            chunks = chunker.process_document(str(temp_path))

        if chunks:
            for chunk in chunks:
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                    "is_grant_app": is_grant_app,
                })
            # One ordered bulk insert instead of one round trip per chunk.
            collection.insert_many(chunks)

            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show a few previews
            for i, c in enumerate(chunks[:3]):
                st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                text = c['text']
                # Only add an ellipsis when the preview is actually truncated.
                st.markdown(text if len(text) <= 400 else text[:400] + "...")
                st.caption(f"Topics: {', '.join(c['metadata']['topics'])} | Category: {c['metadata']['category']}")
                # NOTE(review): st.progress only accepts 0.0-1.0 (float) or
                # 0-100 (int) — confirm DocumentChunker's score range.
                st.progress(c['metadata']['confidence_score'])

            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")
        else:
            st.warning("⚠️ No chunks were generated.")