File size: 2,750 Bytes
b9ae50a
23a7785
b9ae50a
 
 
 
 
 
23a7785
 
 
b9ae50a
23a7785
b9ae50a
23a7785
b9ae50a
 
 
 
 
23a7785
 
 
 
 
 
 
 
 
b9ae50a
 
 
 
 
 
 
 
 
 
 
 
23a7785
 
b9ae50a
 
 
23a7785
b9ae50a
23a7785
b9ae50a
 
 
 
 
 
23a7785
b9ae50a
 
 
 
 
23a7785
b9ae50a
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import streamlit as st
import tempfile
from pymongo import MongoClient
from datetime import datetime
from pathlib import Path
from document_chunker import DocumentChunker

# === MongoDB connection via Hugging Face secrets ===
@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri: str) -> MongoClient:
    """Return a MongoClient for *uri*, reused across Streamlit reruns.

    Streamlit re-executes this script on every widget interaction; caching
    the client as a resource avoids constructing a new client (and its
    connection pool) on each rerun. The spinner is disabled so that nothing
    is rendered before ``st.set_page_config`` runs further down.
    """
    return MongoClient(uri)


# MONGO_URI is required (KeyError if unset); MONGO_DB is optional.
mongo_uri = os.environ["MONGO_URI"]
db_name = os.environ.get("MONGO_DB", "grant_docs")
client = _get_mongo_client(mongo_uri)
db = client[db_name]

# === Streamlit UI ===
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

# Collection pre-selected in the dropdown, and used as the target when the
# database cannot be listed or contains no collections yet.
DEFAULT_COLLECTION = "doc_chunks_cat"

with st.sidebar:
    st.header("Settings")

    # Fetch collection names for dropdown. Only the database call is guarded;
    # any failure is reported in the sidebar and treated as "no collections".
    try:
        existing_collections = db.list_collection_names()
    except Exception as e:
        st.error(f"Failed to list collections: {e}")
        existing_collections = []

    if existing_collections:
        if DEFAULT_COLLECTION in existing_collections:
            default_index = existing_collections.index(DEFAULT_COLLECTION)
        else:
            default_index = 0
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            existing_collections,
            index=default_index,
        )
    else:
        # With nothing to choose from, a selectbox would yield None and the
        # later `db[selected_collection]` lookup would fail; use the default
        # name instead.
        selected_collection = DEFAULT_COLLECTION
        st.caption(f"Using default collection `{DEFAULT_COLLECTION}`.")

    # Stored on every chunk's metadata as `is_grant_app`.
    is_grant_app = st.toggle("Is this a Grant Application?", value=True)

# === Upload handling ===
# Flow: accept a DOCX/TXT upload, skip it if a chunk with the same title is
# already stored, otherwise chunk it with DocumentChunker, tag every chunk
# with upload metadata, bulk-insert the chunks and preview the first few.
uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")

    collection = db[selected_collection]

    # Duplicate detection is by file name only (stored as metadata.title).
    # It runs before anything is written to disk so skipped files cost nothing.
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        st.write("⏳ Processing with DocumentChunker...")
        # NOTE(review): naive local timestamp, kept as-is so stored values
        # keep their existing format.
        modified_time = datetime.now().isoformat()

        # Write the upload into a private, per-run temporary directory:
        #   * concurrent sessions uploading the same name cannot clobber
        #     each other's file,
        #   * only the base name of the client-supplied name is used, so it
        #     cannot point outside the directory,
        #   * the directory (and file) is removed when the block exits,
        #     even if chunking raises.
        # The original file name/extension is preserved for the chunker.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_path = Path(work_dir) / Path(uploaded_file.name).name
            temp_path.write_bytes(uploaded_file.getbuffer())

            chunker = DocumentChunker()
            chunks = chunker.process_document(str(temp_path))

        if chunks:
            upload_metadata = {
                "title": uploaded_file.name,
                "uploaded_at": modified_time,
                "is_grant_app": is_grant_app,
            }
            for chunk in chunks:
                chunk['metadata'].update(upload_metadata)

            # One bulk write instead of one round-trip per chunk.
            collection.insert_many(chunks)

            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show a few previews
            preview_count = 3
            preview_chars = 400
            for i, c in enumerate(chunks[:preview_count], start=1):
                meta = c['metadata']
                st.subheader(f"Chunk {i}: {meta.get('header') or 'No Header'}")

                text = c['text']
                # Only mark the preview as truncated when it actually is.
                if len(text) > preview_chars:
                    st.markdown(text[:preview_chars] + "...")
                else:
                    st.markdown(text)

                # NOTE(review): assumes DocumentChunker always sets `topics`
                # (list of str), `category` and `confidence_score` (a value
                # st.progress accepts) — confirm against document_chunker.
                st.caption(f"Topics: {', '.join(meta['topics'])} | Category: {meta['category']}")
                st.progress(meta['confidence_score'])

            if len(chunks) > preview_count:
                st.info(f"... and {len(chunks)-preview_count} more chunks processed.")
        else:
            st.warning("⚠️ No chunks were generated.")